zoukankan      html  css  js  c++  java
  • selenium+chrome浏览器驱动-爬取百度图片

    百度图片网页中,当页面滚动到底部时,页面会自动加载新的内容。

    我们通过selenium和谷歌浏览器驱动执行js,使浏览器不断加载页面,再通过抓取页面中的图片路径来下载图片。

     """Download Baidu image-search thumbnails with Selenium and Chrome.

     The Baidu image result page appends a new block of results every time it
     is scrolled to the bottom.  This script keeps scrolling the page through
     a Chrome driver, collects the image URLs found in the result container
     and saves every image it has not seen before into ``./baidupic/``.
     It stops once several consecutive scrolls bring no new image URL.
     """
     from selenium import webdriver
     from selenium.webdriver.common.by import By
     from selenium.webdriver.support import expected_conditions as EC
     from selenium.webdriver.support.ui import WebDriverWait
     import requests
     from lxml import etree
     import hashlib
     import time
     import random
     import os
     import re
     from urllib.parse import urlsplit

     # Seconds to wait for a single image request before giving up on it.
     REQUEST_TIMEOUT = 15
     # Number of consecutive scrolls without any new image URL after which the
     # page is considered exhausted.
     MAX_IDLE_ROUNDS = 3

     # Request headers copied from a browser session.  "Host" is deliberately
     # absent: some full-size images are served by third-party sites and may
     # fail to download when a Host header is forced.
     # NOTE(review): the Cookie value contains a logged-in session token
     # (BDUSS) and is sent with every image request, including requests to
     # non-Baidu hosts.  Replace it with your own value or remove it.
     headers = {
     "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
     "Accept-Encoding":"gzip, deflate, br",
     "Accept-Language":"zh-CN,zh;q=0.9",
     "Cache-Control":"max-age=0",
     "Connection":"keep-alive",
     "Cookie":"winWH=%5E6_1197x581; BDIMGISLOGIN=0; BDqhfp=%E5%9B%BE%E7%89%87%26%260-10-1undefined%26%260%26%261; BIDUPSID=24942ACBA645FE0108AF48B5C2509013; BAIDUID=C05587CE8C62CAB17300AA09BC6820BD:FG=1; PSTM=1528274179; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1440_25810_26459_21103_18559_20928; BDUSS=VNneDRnWTQ3fnVQOWJpTG95Z1RZVnllVzlRSURpWnBMWHlwbGZha2lGZWl3VlpiQUFBQUFBJCQAAAAAAAAAAAEAAAB9W1Rr1MbFzNGnzt7Wub6zAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKI0L1uiNC9bW; PSINO=3; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; cflag=15%3A3; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; indexPageSugList=%5B%22%E5%9B%BE%E7%89%87%22%5D; cleanHistoryStatus=0",

     "Referer":"http://image.baidu.com/",
     "Upgrade-Insecure-Requests":"1",
     "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
     }


     def _filename_from_url(img_url):
         """Return a file name derived from *img_url* that is valid on disk.

         The name is the last path segment of the URL with the query string
         and fragment removed; characters that Windows rejects in file names
         are replaced by ``_``.  When nothing usable is left, an MD5 digest of
         the whole URL is used instead so the image is still saved.
         """
         name = urlsplit(img_url).path.rsplit('/', 1)[-1]
         name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', name)
         if not name.strip('. '):
             name = hashlib.md5(img_url.encode('utf-8')).hexdigest()
         return name


     def _download_image(img_url, directory):
         """Fetch *img_url* and store it in *directory*.

         Returns True when the image was written and False when the request or
         the file write failed; failures are reported on stdout and never
         raised, so one bad URL cannot stop the crawl.
         """
         target = os.path.join(directory, _filename_from_url(img_url))
         try:
             response = requests.get(img_url, headers=headers,
                                     timeout=REQUEST_TIMEOUT)
             # Do not save an HTML error page under an image name.
             response.raise_for_status()
             with open(target, mode='wb') as f:
                 f.write(response.content)
         except (requests.RequestException, OSError) as exc:
             # Only network and file errors are handled here, so Ctrl-C
             # still interrupts the script.
             print(img_url, '下载失败', exc)
             return False
         return True


     # Create the browser object.
     # NOTE(review): the published listing had lost the path separators
     # ('E:PycharmProjectspachongchromedriver.exe'); they are restored here on
     # the assumption of a PyCharm project layout.  Point this at your local
     # chromedriver binary.
     browser = webdriver.Chrome(executable_path=r'E:\PycharmProjects\pachong\chromedriver.exe')
     # Maximum time (seconds) to wait for the result container to appear.
     wait = WebDriverWait(browser,20)

     # Directory the images are saved to.
     path = './baidupic/'
     os.makedirs(path, exist_ok=True)

     seen_urls = set()
     idle_rounds = 0
     try:
         # Open the search result page.
         browser.get('https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%9B%BE%E7%89%87&oq=%E5%9B%BE%E7%89%87&rsp=-1')

         while idle_rounds < MAX_IDLE_ROUNDS:
             # Wait until the last result block exists (each load appends one
             # more div to the #imgid container).
             wait.until(EC.presence_of_all_elements_located((By.XPATH,'//div[@id="imgid"]/div[last()]')))
             page = etree.HTML(browser.page_source)
             # Thumbnail URLs.  All blocks are scanned, not just the last one,
             # so nothing is lost when several blocks load between two polls;
             # seen_urls keeps already handled images from being fetched again.
             # For full-size images use '//div[@id="imgid"]//li/@data-objurl'.
             img_urls = page.xpath('//div[@id="imgid"]//img/@data-imgurl')

             found_new = False
             for img_url in img_urls:
                 if img_url in seen_urls:
                     continue
                 seen_urls.add(img_url)
                 found_new = True
                 # Single-threaded downloads are slow enough that no extra
                 # per-image delay is added.
                 _download_image(img_url, path)
             idle_rounds = 0 if found_new else idle_rounds + 1

             # Scroll to the bottom so the page loads the next block (runs JS).
             browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
             # Give the page time to load the new content.
             time.sleep(5+ random.random()*1)
     finally:
         # Always shut the driver down so no chromedriver process is left
         # behind, including on Ctrl-C or an unexpected error.
         browser.quit()

    请求头headers中的内容源于浏览器的审查。删除了Host内容,百度的有些大图来源于其他网站,如果设置Host,一些大图可能不能下载。

    在网页源码中发现,图片有大图,有小图,路径不同。

  • 相关阅读:
    SP笔记:交叉实现七行并成一行
    HTML tag 学习
    操作哈希表
    Efficient bipedal robots based on passivedynamic walkers
    Pushing People Around
    ZEROMOMENT PONTTHIRTY FIVE YEARS OF ITS LIFE

    Active Learning for RealTime Motion Controllers
    Accelerometerbased User Interfaces for the Control of a Physically Simulated Character
    Dynamic Response for Motion Capture Animation
  • 原文地址:https://www.cnblogs.com/doitjust/p/9222118.html
Copyright © 2011-2022 走看看