"""Bulk-download search-result thumbnails from soso.huitu.com.

For each keyword in ``main``'s word list, pages of search results are
fetched and every thumbnail image URL found on the page is downloaded
into a per-keyword directory.
"""
import os
import re
from urllib import parse

import requests
from requests.adapters import HTTPAdapter


def getpiclist(page, kw):
    """Fetch one search-result page and extract thumbnail image URLs.

    Args:
        page: 1-based page number of the search results.
        kw: URL-encoded (percent-quoted) search keyword.

    Returns:
        list[str]: thumbnail URLs found on the page; an empty list when
        the HTTP request does not return 200 or no images match.
    """
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'dy2018=1; statinfo=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DiqbEA3_P4ej6UVrv29ikk9Rs4ZXRHyh4c28G8AeYd37%26wd%3D%26eqid%3Dd6bb34b50005e6a2000000065be52571; Hm_lvt_a4fa2a41b865534a782ceef2185fffaf=1541744017; Hm_lpvt_a4fa2a41b865534a782ceef2185fffaf=1541744029',
        'Host': 'soso.huitu.com',
        'Referer': 'http://soso.huitu.com/search?kw=' + kw + '&page=1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        # Mimic the site's own AJAX request so the JSON/listtype endpoint answers.
        'X-Requested-With': 'XMLHttpRequest',
    }
    url = ("http://soso.huitu.com/search/?kw=" + kw
           + "&page=" + str(page) + "&listtype=2")
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Always return a list so callers never have to type-sniff the result.
        return []
    # NOTE(review): the original pattern was mangled in transit ("picddd",
    # "d+" with the backslashes stripped, unescaped quotes). Reconstructed as
    # a raw-string regex matching thumbnail URLs of the form
    # http://picNNN.huitu.com/pic/<digits>/<digits>_<digits>_0.jpg — confirm
    # against a live page if the site markup has changed.
    return re.findall(
        r'<img src="(http://pic\d\d\d\.huitu\.com/pic/\d+/\d+_\d+_0\.jpg)" />',
        response.text)


def downloadpic(url, savepath, s):
    """Download a single image and save it under *savepath*.

    Args:
        url: direct URL of the image; the final path component is used
            as the local file name.
        savepath: existing directory (with trailing slash) to write into.
        s: a ``requests.Session`` (shared so retries/keep-alive apply).

    Network errors are caught and reported on stdout; they never propagate.
    """
    try:
        pic = s.get(url, timeout=5)
        picname = url.rsplit("/", 1)[1]
        # ``with`` guarantees the file handle is closed even if write fails.
        with open(os.path.join(savepath, picname), 'wb') as fp:
            fp.write(pic.content)
    except requests.exceptions.ConnectionError:
        print(url, "【错误】当前图片无法下载")
    except requests.exceptions.ReadTimeout:
        print(url, "【错误】超时")
    except requests.exceptions.ChunkedEncodingError:
        print(url, "【错误】远程主机强迫关闭了一个现有的连接")
    except requests.exceptions.RequestException as e:
        print(url, "【错误】", e)


def main():
    """Crawl every keyword in *words*, page by page, downloading thumbnails."""
    words = ["芒果"]  # keywords to download
    for word in words:
        start = 1      # first page to fetch
        pagenum = 10   # number of pages to fetch
        kw = parse.quote(str(word))
        savepath = "./" + word + "/"
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        s = requests.Session()
        # Retry transient connection failures up to twice per request.
        s.mount('http://', HTTPAdapter(max_retries=2))
        s.mount('https://', HTTPAdapter(max_retries=2))
        # range(start, start + pagenum) so exactly ``pagenum`` pages are
        # fetched (the original range(start, pagenum) was one page short).
        for i in range(start, start + pagenum):
            picturelist = getpiclist(i, kw)
            if not picturelist:
                # Empty list: request failed or results are exhausted.
                print("爬取完成")
                break
            print(f"爬取第 {i} 页内容")
            # enumerate() instead of list.index(): O(1) per item and
            # correct even when the same URL appears twice on a page.
            for idx, picurl in enumerate(picturelist):
                print(f"正在下载第 {i} 页第 {idx} 张图片")
                # NOTE: uncomment to fetch the larger "_1" variant instead:
                # picurl = picurl.replace("/pic/", "/res/", 1)
                # picurl = picurl.replace("_0.", "_1.", 1)
                downloadpic(picurl, savepath, s)


if __name__ == "__main__":
    main()