在百度图片网页中,当页面滚动到底部时,页面会加载新的内容。
我们通过selenium和谷歌浏览器驱动执行js,使浏览器不断加载页面,再通过抓取页面中的图片路径来下载图片。
"""Download thumbnails from a Baidu image search result page.

The result page loads another batch of images every time it is scrolled to
the bottom.  This script drives Chrome through selenium, scrolls the page
with JavaScript, reads the image URLs out of the rendered page source and
saves every image it has not seen yet into ``path``.
"""
import os
import random
import time
from urllib.parse import urlparse

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Request headers copied from the browser's developer tools.
# NOTE(review): the Cookie value contains a login session token (BDUSS) and
# is sent with every image request, whatever host serves the image --
# consider dropping it or loading it from the environment.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "winWH=%5E6_1197x581; BDIMGISLOGIN=0; BDqhfp=%E5%9B%BE%E7%89%87%26%260-10-1undefined%26%260%26%261; BIDUPSID=24942ACBA645FE0108AF48B5C2509013; BAIDUID=C05587CE8C62CAB17300AA09BC6820BD:FG=1; PSTM=1528274179; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1440_25810_26459_21103_18559_20928; BDUSS=VNneDRnWTQ3fnVQOWJpTG95Z1RZVnllVzlRSURpWnBMWHlwbGZha2lGZWl3VlpiQUFBQUFBJCQAAAAAAAAAAAEAAAB9W1Rr1MbFzNGnzt7Wub6zAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKI0L1uiNC9bW; PSINO=3; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; cflag=15%3A3; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; indexPageSugList=%5B%22%E5%9B%BE%E7%89%87%22%5D; cleanHistoryStatus=0",
    "Referer": "http://image.baidu.com/",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
}

# Location of the ChromeDriver binary; machine-specific.
# NOTE(review): the path separators appear to be missing from this value --
# set it to the real location of chromedriver.exe before running.
CHROMEDRIVER_PATH = r'E:PycharmProjectspachongchromedriver.exe'

# Baidu image search for the keyword "图片" (URL-encoded in word= / oq=).
SEARCH_URL = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%9B%BE%E7%89%87&oq=%E5%9B%BE%E7%89%87&rsp=-1'

# Each newly loaded batch is appended as one more child div of #imgid,
# so the last child div is the most recently loaded batch.
LAST_BATCH_XPATH = '//div[@id="imgid"]/div[last()]'
# Thumbnail URLs.  For the full-size images read
# LAST_BATCH_XPATH + '//li/@data-objurl' instead.
THUMBNAIL_XPATH = LAST_BATCH_XPATH + '//img/@data-imgurl'

# Directory the images are written to.
path = './baidupic/'

# Seconds to wait for a single image request before giving up on it.
REQUEST_TIMEOUT = 15
# Stop after this many consecutive scrolls that produced no unseen image URL.
MAX_IDLE_ROUNDS = 3


def file_name_from_url(img_url):
    """Return the last path segment of ``img_url`` (query string excluded).

    Returns an empty string when the URL path has no final segment.
    """
    return os.path.basename(urlparse(img_url).path)


def download_image(img_url, target_dir):
    """Fetch ``img_url`` and store it in ``target_dir`` under its own name.

    Returns True when the file was written and False when the URL has no
    usable file name, the request failed or returned an HTTP error status,
    or the file could not be written.  Failures are reported on stdout and
    never raised, so one bad image does not stop the crawl.
    """
    fname = file_name_from_url(img_url)
    if not fname:
        print(img_url, '下载失败')
        return False
    try:
        response = requests.get(img_url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Do not save an HTML error page under an image file name.
        response.raise_for_status()
        with open(os.path.join(target_dir, fname), mode='wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as err:
        print(img_url, '下载失败', err)
        return False
    return True


# Create the browser object.
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
try:
    # Upper bound, in seconds, for each explicit wait on the page.
    wait = WebDriverWait(browser, 20)
    browser.get(SEARCH_URL)

    os.makedirs(path, exist_ok=True)

    seen_urls = set()
    idle_rounds = 0
    while idle_rounds < MAX_IDLE_ROUNDS:
        # Block until the most recent batch div is present in the DOM.
        wait.until(EC.presence_of_all_elements_located((By.XPATH, LAST_BATCH_XPATH)))
        tree = etree.HTML(browser.page_source)

        new_count = 0
        for img_url in tree.xpath(THUMBNAIL_XPATH):
            # Skip empty attributes and anything handled in an earlier round;
            # the last div is unchanged when a scroll loaded nothing new.
            if not img_url or img_url in seen_urls:
                continue
            seen_urls.add(img_url)
            new_count += 1
            download_image(img_url, path)

        idle_rounds = 0 if new_count else idle_rounds + 1

        # Scroll to the bottom so the page requests the next batch.
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # Give the page time to load; the jitter avoids a fixed request rhythm.
        time.sleep(5 + random.random() * 1)
finally:
    # Always shut down Chrome and the driver process, even after an error
    # or Ctrl-C.
    browser.quit()
请求头headers中的内容来源于浏览器的审查(开发者工具)。其中删除了Host字段:百度的有些大图来源于其他网站,如果设置了Host,这些大图可能无法下载。
在网页源码中发现,图片有大图,有小图,路径不同。