zoukankan      html  css  js  c++  java
  • 某写真网站爬虫

    写了一个很粗糙的某写真网站的小爬虫,有空改改

    from selenium import webdriver
    import re
    import requests
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from pyquery import PyQuery as pq
    from selenium.webdriver.firefox.options import Options
    
    # Page that is opened both for logging in and for scraping the images.
    url = 'http://www.tujidao.com/a/?id=25309'

    # PhantomJS service arguments: do not load images, do not use the disk cache.
    # Only used by the commented-out PhantomJS alternative below.
    PhantomJS_conf = ['--load-images=false', '--disk-cache=false']

    # Run Firefox without a visible window.
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    # `firefox_options=` is the deprecated spelling and was removed in
    # Selenium 4; `options=` is the supported keyword.
    browser = webdriver.Firefox(options=options)

    # Alternative PhantomJS driver, kept for reference:
    # browser = webdriver.PhantomJS(service_args=PhantomJS_conf)
    # browser.set_window_size(1400,900)                               # set the browser window size

    # Shared explicit wait with a 10 second timeout.
    wait = WebDriverWait(browser, 10)
    
    def login():
        """Submit the login form on ``url`` and return the page source.

        Loads ``url`` in the module-level ``browser``, waits (via the shared
        ``wait``) for the account field, the password field and the submit
        button to be present, types into both fields, clicks the button,
        reloads ``url`` and returns ``browser.page_source``.
        """
        def locate(css):
            # Block until an element matching the CSS selector is in the DOM.
            return wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, css)))

        browser.get(url)

        # Account field
        account_box = locate(
            'div.layui-form-item:nth-child(1) > div:nth-child(2) > input:nth-child(1)')
        # Password field
        password_box = locate(
            'div.layui-form-item:nth-child(2) > div:nth-child(2) > input:nth-child(1)')
        # Login button
        submit_button = locate('.layui-btn')

        # NOTE(review): int() evaluates to 0; these look like placeholders
        # for the real account and password - confirm before running.
        account_box.send_keys(int())
        password_box.send_keys(int())
        submit_button.click()

        # Load the target page again after submitting the form.
        browser.get(url)
        return browser.page_source
    
    def get_image():
        """Log in and return the ``img`` elements found under ``#kbox``.

        The HTML returned by ``login()`` is parsed with PyQuery and the
        result of the ``'#kbox img'`` selection is returned unchanged.
        """
        page = pq(login())
        # Select every <img> inside the element whose id is "kbox".
        return page('#kbox img')
    
    def register():
        """Return the ``data-src`` values found in the images from ``get_image()``.

        The selection returned by ``get_image()`` is converted to a string and
        scanned with a regular expression; the result is the list of strings
        captured by the pattern's single group.
        """
        markup = str(get_image())
        # re.S lets '.' also match newlines inside a tag.
        img_pattern = re.compile('<img.*?data-src="(.*?)"/>', re.S)
        return img_pattern.findall(markup)
    
    # Prefix shared by every saved file; the running index and '.jpg' are
    # appended to it (e.g. <prefix>0.jpg, <prefix>1.jpg, ...).
    # NOTE(review): the published listing had lost the path separators and
    # contained a literal TAB where '\t' used to be, which is not a valid
    # Windows file name. The separators are restored here; confirm whether
    # the intended target was '...\Desktop\testa' or '...\Desktop\test\a'.
    save_prefix = r'C:\Users\admin\Desktop\testa'

    count = 0
    for image_url in register():
        # Download one image and write its raw bytes to the next numbered file.
        response = requests.get(image_url)
        with open('{}{}.jpg'.format(save_prefix, count), mode='wb') as f:
            f.write(response.content)
        count += 1
  • 相关阅读:
    LINQ 详解
    oracle下查询的sql已经超出IIS响应时间
    IOC应用之 Ninject
    JSONP ---------跨域
    国内各大互联网公司相关技术站点2.0版 (集合腾讯、阿里、百度、搜狐、新浪、360等共49个)
    IO多路复用,以socket为例
    socket机制下实现的多用户与服务器交互
    在一个进程中定义多个线程
    基于tcp的socketserver,即tcp的多线程
    基于upd的socketserver,即udp的多线程
  • 原文地址:https://www.cnblogs.com/jiuyachun/p/11284311.html
Copyright © 2011-2022 走看看