Crawling images from a Baidu Baike page
# -*- coding:utf-8 -*-
import urllib3 as ul
import certifi
import urllib3.contrib.pyopenssl as pyopenssl
from bs4 import BeautifulSoup as bs
import time

# Note: find_all() returns a list containing every matching element,
# while find() returns only the first match directly.

link = 'https://baike.baidu.com/item/%E5%8F%A4%E5%A4%A9%E4%B9%90/107686?fr=aladdin'

# Strategy: first collect the image URLs into a list, then download them one by one.

def crawler(link):
    """Main entry point: fetch the page, extract the image links, download each one."""
    html = get_html(link)                      # fetch the raw HTML
    (piclinks, filename) = get_pic_link(html)  # image URL list and base filename
    print(piclinks)
    # download
    for i, url in enumerate(piclinks):
        download(url, filename + str(i) + '.jpg')
        # time.sleep(10)  # optionally throttle requests

def get_html(link):
    """Open a connection and fetch the whole HTML document.

    Why use a connection pool? Every standalone request pays for a full TCP
    three-way handshake; by reusing an already-open socket (supported since
    HTTP/1.1), we cut server-side resource usage and get faster responses.
    """
    pyopenssl.inject_into_urllib3()
    http = ul.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    r = http.request('GET', link)
    return r.data

def get_pic_link(html):
    """Locate the images in the fetched HTML with Beautiful Soup;
    return the list of image URLs and a base filename."""
    soup = bs(html, 'html.parser')  # build the BeautifulSoup object
    # Find every <img> tag and keep its src attribute (the image URL), e.g.
    # <img src="https://bkssl.bdimg.com/static/wiki-lemma/widget/lemma_content/configModule/hotspotmining/img/logo_netease_715533d.png" />
    # Tags without a src attribute yield None, so filter them out.
    purls = [img.get('src') for img in soup.find_all('img') if img.get('src')]
    # Use the page <title> text as the base of each image's filename.
    filename = soup.find('title').get_text()
    return (purls, filename)

def download(url, filename):
    """Download one image from its URL and save it to disk."""
    pyopenssl.inject_into_urllib3()
    http = ul.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
    res = http.request('GET', url)
    with open(filename, 'wb') as f:
        f.write(res.data)

if __name__ == '__main__':
    crawler(link)
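The find_all() vs. find() note at the top of the script is easy to see in isolation. Here is a minimal sketch; the HTML fragment is made up purely for illustration:

from bs4 import BeautifulSoup

# A made-up fragment, just to show the difference.
html = '<div><img src="a.png"/><img src="b.png"/></div>'
soup = BeautifulSoup(html, 'html.parser')

print(soup.find_all('img'))  # a list of every match, roughly: [<img src="a.png"/>, <img src="b.png"/>]
print(soup.find('img'))      # only the first match: <img src="a.png"/>

This is why the script iterates over the result of find_all('img') but calls get_text() directly on the result of find('title').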
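One caveat on the connection-pool comment in get_html(): the script above builds a fresh PoolManager inside both get_html() and download(), so each call starts with empty pools and the socket-reuse benefit described in the comment is mostly lost. A minimal sketch of sharing one pool across all requests follows; the module-level HTTP pool and the fetch() helper are names I am assuming for illustration, not part of the original script:

import certifi
import urllib3

# One shared pool for the whole program, so repeated requests to the same
# host can reuse an open socket instead of re-running the TCP handshake.
HTTP = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

def fetch(url):
    # Every call goes through the same pool; urllib3 keeps connections alive.
    return HTTP.request('GET', url).data

With this shape, get_html() and download() would both call fetch(), and pyopenssl.inject_into_urllib3() would only need to run once at import time.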