zoukankan      html  css  js  c++  java
  • selenium实现百度图片爬取

    因为百度图片是以瀑布流形式、通过 ajax 异步加载数据的,所以这里用到抓包工具(fiddler)来抓取链接

    好了,直接上代码:

     1 from selenium import webdriver
     2 from selenium.webdriver.common.by import By
     3 import requests,time
     4 from queue import Queue
     5 from urllib import request
     6 import os,gevent
     7 from lxml import etree
     8 
     9 
    10 
    11 
    def get_img(html):
        """Download every image referenced by one queued page snapshot.

        One HTML document is taken off the queue ``html``.  The
        ``data-objurl`` attribute of every ``<li>`` found beneath the last
        ``<div>`` child of the element with ``id="imgid"`` is treated as an
        image URL and saved into ``./baidupic/`` (created on demand).
        A failed download is reported and skipped; it never aborts the
        remaining downloads.

        :param html: queue of page-source strings; one item is consumed per
            call.
        :return: ``None``.  Returns immediately when the queue is empty.
        """
        # Function-scope imports keep this block self-contained.
        from queue import Empty
        from urllib.parse import urlsplit

        try:
            # Non-blocking on purpose: a blocking get() on an empty queue
            # never returns once the producer is done, so any surplus worker
            # would hang the whole program.
            page_source = html.get_nowait()
        except Empty:
            return

        tree = etree.HTML(page_source)
        img_urls = tree.xpath('//div[@id="imgid"]/div[last()]//li/@data-objurl')

        path = './baidupic/'
        # exist_ok avoids the check-then-create race between workers.
        os.makedirs(path, exist_ok=True)

        for url in img_urls:
            print(url)
            try:
                # Name the file after the URL *path* only, so a query string
                # or fragment never leaks into the file name.
                fname = os.path.basename(urlsplit(url).path)
                request.urlretrieve(url, os.path.join(path, fname))
            except Exception as exc:
                # Best-effort download: report the reason and carry on.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit propagate.
                print('图片不存在', exc)
            else:
                print('下载成功')
    33 
    34 
    def get_page(keyword='美女', scroll_times=10, scroll_pause=2,
                 driver_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe'):
        """Search Baidu Images with Chrome and download the results.

        Opens the Baidu image search page, submits ``keyword``, scrolls to
        the bottom ``scroll_times`` times and queues a snapshot of the page
        source after each scroll.  One ``get_img`` greenlet is then spawned
        per queued snapshot and all of them are awaited.

        :param keyword: text typed into the search box.
        :param scroll_times: number of scroll-and-snapshot rounds.
        :param scroll_pause: seconds to wait after each scroll so the
            asynchronously loaded results can arrive before the snapshot.
        :param driver_path: location of the chromedriver executable.  The
            default is machine-specific; pass your own path.
        :return: ``None``.
        """
        # Snapshots waiting to be parsed by the download workers.
        q = Queue()

        # Baidu image search address.
        base_url = 'https://image.baidu.com/'
        # NOTE(review): ``executable_path`` is accepted by Selenium 3 and
        # early 4.x; newer releases expect a ``Service`` object instead --
        # confirm against the installed Selenium version.
        browser = webdriver.Chrome(executable_path=driver_path)
        try:
            browser.get(base_url)
            # ``find_element(By...)`` works on both Selenium 3 and 4, whereas
            # the ``find_element_by_*`` helpers were removed in Selenium 4.
            browser.find_element(By.ID, 'kw').send_keys(keyword)
            browser.find_element(By.CLASS_NAME, 's_search').click()

            for _ in range(scroll_times):
                browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
                # Without a pause every snapshot is taken before the newly
                # requested results have been added to the page.
                time.sleep(scroll_pause)
                q.put(browser.page_source)
        finally:
            # Always shut the browser and its driver process down.
            browser.quit()

        # Exactly one worker per snapshot: a surplus worker would block
        # forever on the empty queue.
        # NOTE(review): greenlets only overlap network I/O when gevent's
        # monkey patching is active; no such call is visible here.
        workers = [gevent.spawn(get_img, q) for _ in range(q.qsize())]
        gevent.joinall(workers)
    65 
    66 
    67 
    68 
    69 
    70 
    71 
    72 
    73 # browser.save_screenshot('baidupic.png')
    74 # print(browser.page_source)
    75 # browser.find_element(By_)
    76 
    if __name__ == "__main__":
        # Script entry point: start the crawl only when run directly.
        get_page()
  • 相关阅读:
    OpenShift和F5的集成手册
    OpenShift负载分区策略(Router Shading)
    Istio在Openshift 3.11的安装
    Openshift 和Harbor的集成
    OpenShift 如何获取bearer Token以便进行各种API调用
    Openshift 3.11和LDAP的集成
    Openshift 节点添加和删除
    Spring Dataflow批处理框架在OCP上的部署
    Openshift 用户,角色和RBAC
    取消Windows server 关机提示备注的方法
  • 原文地址:https://www.cnblogs.com/lyxdw/p/9231515.html
Copyright © 2011-2022 走看看