import urllib.request as ur
import urllib.error as ue
import re

# Target URL
url = 'https://list.jd.com/list.html?cat=670,671,672'
# Directory where the downloaded images are stored
save_path = 'E:/workspace/PyCharm/codeSpace/books/python_web_crawler_book/chapter6/demo1/images/'
# Proxy server IP and port
proxy_add = '115.174.66.148:8118'


def get_JD_pictures(url, save_path, proxy_add, page):
    # Build the URL for the requested page number
    url = url + "&page=" + str(page)
    # Add a User-Agent header so the request looks like a normal browser
    req = ur.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0')

    # Set up the proxy; register it for both schemes, since the target
    # URL uses https
    proxy = ur.ProxyHandler({'http': proxy_add, 'https': proxy_add})
    opener = ur.build_opener(proxy, ur.HTTPHandler)
    ur.install_opener(opener)

    # Fetch the page and decode the response body
    info = ur.urlopen(req).read().decode('utf-8', errors='ignore')

    # Narrow the HTML down to the product-list block; re.S lets '.'
    # match newlines so the pattern can span multiple lines
    pattern_1 = '<div id="plist".+?<div class="page clearfix">'
    matches = re.compile(pattern_1, re.S).findall(info)
    if not matches:
        print('Product list not found on page', page)
        return
    info = matches[0]

    # Extract the protocol-relative image URLs (note the escaped dot in '\.jpg')
    pattern_2 = r'<img width="220" height="220" data-img="1" src="//(.+?\.jpg)">'
    image_list = re.compile(pattern_2).findall(info)

    for x, image_url in enumerate(image_list, start=1):
        image_name = save_path + str(page) + "_" + str(x) + ".jpg"
        image_url = "http://" + image_url
        try:
            ur.urlretrieve(image_url, filename=image_name)
        # HTTPError is a subclass of URLError, so it must be caught first
        except ue.HTTPError as e:
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
        except ue.URLError as e:
            if hasattr(e, 'reason'):
                print(e.reason)


get_JD_pictures(url, save_path, proxy_add, 1)
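
# A minimal multi-page sketch (assumption: subsequent JD list pages follow the
# same "&page=N" query scheme that get_JD_pictures builds above). Uncomment to
# fetch pages 2 and 3 with the same proxy and save path:
#
# for page in range(2, 4):
#     get_JD_pictures(url, save_path, proxy_add, page)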