zoukankan      html  css  js  c++  java
  • selenium+chrome浏览器驱动-爬取百度图片

    百度图片网页中,当页面滚动到底部时,页面会加载新的内容。

    我们通过selenium和谷歌浏览器驱动执行js,使浏览器不断加载新的页面内容,再通过抓取页面中的图片路径来下载图片。

     """Crawl Baidu image search results with Selenium and Chrome.

     The Baidu image result page loads another batch of results every time the
     page is scrolled to the bottom.  This script keeps scrolling a real Chrome
     window, reads the thumbnail URLs out of the rendered page and downloads
     each one with ``requests``.
     """

     import os
     import random
     import time

     import requests
     from lxml import etree
     from selenium import webdriver
     from selenium.webdriver.common.by import By
     from selenium.webdriver.support import expected_conditions as EC
     from selenium.webdriver.support.ui import WebDriverWait

     # Request headers sent with every image download.
     # NOTE(review): the Cookie value below embeds a logged-in session token
     # (BDUSS).  It is sent to every host an image URL points to; replace it
     # with your own value or drop the header before sharing or reusing this
     # script.
     headers = {
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
         "Accept-Encoding": "gzip, deflate, br",
         "Accept-Language": "zh-CN,zh;q=0.9",
         "Cache-Control": "max-age=0",
         "Connection": "keep-alive",
         "Cookie": "winWH=%5E6_1197x581; BDIMGISLOGIN=0; BDqhfp=%E5%9B%BE%E7%89%87%26%260-10-1undefined%26%260%26%261; BIDUPSID=24942ACBA645FE0108AF48B5C2509013; BAIDUID=C05587CE8C62CAB17300AA09BC6820BD:FG=1; PSTM=1528274179; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1440_25810_26459_21103_18559_20928; BDUSS=VNneDRnWTQ3fnVQOWJpTG95Z1RZVnllVzlRSURpWnBMWHlwbGZha2lGZWl3VlpiQUFBQUFBJCQAAAAAAAAAAAEAAAB9W1Rr1MbFzNGnzt7Wub6zAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKI0L1uiNC9bW; PSINO=3; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; cflag=15%3A3; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; indexPageSugList=%5B%22%E5%9B%BE%E7%89%87%22%5D; cleanHistoryStatus=0",
         "Referer": "http://image.baidu.com/",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
     }

     # Location of the chromedriver binary.
     # NOTE(review): the published listing had lost its path separators
     # ('E:PycharmProjectspachongchromedriver.exe'); they are restored here on
     # the assumption that each run-together word was a directory.  Point this
     # at your own chromedriver.
     CHROMEDRIVER_PATH = r'E:\PycharmProjects\pachong\chromedriver.exe'

     # Baidu image search for the keyword "图片" (URL-encoded in word= / oq=).
     SEARCH_URL = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%9B%BE%E7%89%87&oq=%E5%9B%BE%E7%89%87&rsp=-1'

     # Directory the downloaded images are written to.
     path = './baidupic/'

     # Every batch of results is appended as a new child div of #imgid; the
     # wait below blocks until at least one such div is present.
     LAST_PAGE_XPATH = '//div[@id="imgid"]/div[last()]'
     # Thumbnail URLs.  The full-size URLs live in //li/@data-objurl instead.
     IMG_URL_XPATH = '//div[@id="imgid"]/div//img/@data-imgurl'

     # Seconds to wait for the result container to appear.
     PAGE_WAIT_SECONDS = 20
     # Seconds before a single image download is abandoned.
     REQUEST_TIMEOUT = 30
     # Stop after this many consecutive scrolls that reveal no new image URL.
     MAX_IDLE_ROUNDS = 3


     def download_image(img_url, dest_dir):
         """Download one image into *dest_dir*.

         The file keeps the last path segment of the URL as its name.

         Returns True on success.  On a network error, a non-2xx response or a
         file-system error the failure is printed and False is returned, so one
         bad URL never stops the crawl.
         """
         fname = img_url.split('/')[-1]
         try:
             response = requests.get(img_url, headers=headers,
                                     timeout=REQUEST_TIMEOUT)
             # Without this an error page would be saved as if it were an image.
             response.raise_for_status()
             with open(os.path.join(dest_dir, fname), mode='wb') as f:
                 f.write(response.content)
         except (requests.RequestException, OSError):
             print(img_url, '下载失败')
             return False
         return True


     def main():
         """Scroll the result page and download every thumbnail it reveals.

         The loop ends once MAX_IDLE_ROUNDS consecutive scrolls bring no image
         URL that has not been seen before.  The browser is always closed, even
         when the crawl is interrupted or the page wait times out.
         """
         # NOTE(review): ``executable_path`` is the Selenium 3 way of locating
         # the driver; newer Selenium releases expect a Service object instead.
         browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
         try:
             wait = WebDriverWait(browser, PAGE_WAIT_SECONDS)
             browser.get(SEARCH_URL)
             os.makedirs(path, exist_ok=True)

             seen = set()
             idle_rounds = 0
             while idle_rounds < MAX_IDLE_ROUNDS:
                 wait.until(EC.presence_of_all_elements_located(
                     (By.XPATH, LAST_PAGE_XPATH)))
                 tree = etree.HTML(browser.page_source)

                 # Scan every result div, not only the last one: a single
                 # scroll may append more than one div, and the seen-set keeps
                 # already downloaded images from being fetched again.
                 found_new = False
                 for img_url in tree.xpath(IMG_URL_XPATH):
                     if img_url in seen:
                         continue
                     seen.add(img_url)
                     found_new = True
                     # No extra delay between images: the single-threaded
                     # download already spaces the requests out.
                     download_image(img_url, path)

                 idle_rounds = 0 if found_new else idle_rounds + 1

                 # Scroll to the bottom so the page requests the next batch.
                 browser.execute_script(
                     'window.scrollTo(0,document.body.scrollHeight)')
                 # Give the new batch time to load.
                 time.sleep(5 + random.random() * 1)
         finally:
             browser.quit()


     if __name__ == '__main__':
         main()

    请求头headers中的内容来自浏览器的开发者工具(审查元素)。其中删除了Host字段:百度的有些大图来源于其他网站,如果设置了Host,这些大图可能无法下载。

    在网页源码中发现,图片有大图,有小图,路径不同。

  • 相关阅读:
    CodeForces gym Nasta Rabbara lct
    bzoj 4025 二分图 lct
    CodeForces 785E Anton and Permutation
    bzoj 3669 魔法森林
    模板汇总——快读 fread
    bzoj2049 Cave 洞穴勘测 lct
    bzoj 2002 弹飞绵羊 lct裸题
    HDU 6394 Tree 分块 || lct
    HDU 6364 Ringland
    nyoj221_Tree_subsequent_traversal
  • 原文地址:https://www.cnblogs.com/doitjust/p/9222118.html
Copyright © 2011-2022 走看看