# Scrape the landing page of haoduanzi.com and save the image of each post
# <div> to the current directory as img0.jpg, img1.jpg, ...
from lxml import etree
import requests
from urllib import request  # kept for compatibility with the original script

url = 'http://www.haoduanzi.com/'
headers = {
    # Browser User-Agent so the site does not reject the scripted request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
url_content = requests.get(url, headers=headers).text

tree = etree.HTML(url_content)

# Slice off the first two and the last child <div> of #main — presumably
# navigation/ad containers rather than posts (TODO confirm against the page).
div_list = tree.xpath('//div[@id="main"]/div')[2:-1]

for i, div in enumerate(div_list):
    img_url = div.xpath('./div/img/@src')[0]
    # Fetch the image ONCE with the browser headers and write the bytes
    # ourselves. The original fetched every image twice (requests.get for an
    # unused `img_content`, then urlretrieve again — without the headers).
    img_content = requests.get(url=img_url, headers=headers).content
    with open('img%d.jpg' % i, 'wb') as f:
        f.write(img_content)
注意:尽量不要在循环体内反复做文件 IO 操作,容易出现问题;上面基于 for 循环调用 urlretrieve 的写法,执行效率要快于下面在循环内用 with open 逐个写文件的写法。存在问题的代码如下:
# Scrape the same page, but name each saved image with a UUID instead of a
# running counter.
#
# Bug fixes vs. the original:
#  * uuid4() was called ONCE before the loop, so every image was written to
#    the same file name and only the last download survived. A fresh UUID is
#    now generated per iteration.
#  * The output path r'C:jupyterday02\%s.jpg' was missing the separator after
#    the drive letter (a drive-relative path). Repaired to an absolute
#    directory — NOTE(review): confirm the intended target folder.
from lxml import etree
import requests
from uuid import uuid4
import time
import os
from urllib import request  # kept for compatibility with the original script

url = 'http://www.haoduanzi.com/'
headers = {
    # Browser User-Agent so the site does not reject the scripted request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
url_content = requests.get(url, headers=headers).text

tree = etree.HTML(url_content)

# Slice off the first two and the last child <div> of #main — presumably
# navigation/ad containers rather than posts (TODO confirm against the page).
div_list = tree.xpath('//div[@id="main"]/div')[2:-1]

for div in div_list:
    img_url = div.xpath('./div/img/@src')[0]
    img_content = requests.get(url=img_url, headers=headers).content
    time.sleep(2)  # throttle requests to avoid hammering the server
    # A fresh UUID per image so files do not overwrite each other.
    filename = '%s.jpg' % uuid4()
    with open(os.path.join(r'C:\jupyterday02', filename), 'wb') as f:
        f.write(img_content)