使用说明:运行后按提示输入你想要的关键字即可;然后把代码中保存图片的文件目录('E:/Python/spider/Project/GetPicture/Picture/')改成你想要的目录就行了
notice:搜索结果可以翻页,翻页是通过pn参数实现的,每页有20张图,所以每翻一页pn加20:第一页pn为0,第二页为20,以此类推。修改parse函数中for循环的range(0, 510, 20),就可以翻到你想翻的页数了
问题:我的代码有点问题,就是只能爬60张图片,原因未知。。。
import os
import re
import time

import requests

# Endpoint of the paginated ("flip") Baidu image search.
SEARCH_URL = 'https://image.baidu.com/search/flip'

# Browser-like User-Agent sent with every request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'}

# Directory the downloaded pictures are saved to; change it to suit your machine.
DEFAULT_SAVE_DIR = 'E:/Python/spider/Project/GetPicture/Picture/'

# Captures the original-image URL of every result embedded in a search page.
OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",')


def getHTML(url, kv, headers):
    """Fetch a page and return its decoded text.

    Args:
        url: Address to request.
        kv: Query-string parameters passed to ``requests.get``.
        headers: HTTP headers sent with the request.

    Returns:
        The response body as text, or ``""`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        # The timeout keeps one stalled search request from hanging the crawl.
        r = requests.get(url, params=kv, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parse(k, save_dir=DEFAULT_SAVE_DIR):
    """Download the pictures found for keyword ``k`` into ``save_dir``.

    Walks the result pages through the ``pn`` query parameter (0, 20, ...,
    500), extracts every ``objURL`` from each page and saves the pictures as
    ``1.jpg``, ``2.jpg``, ... in ``save_dir``. A picture that cannot be
    downloaded or written is reported and skipped.

    Args:
        k: Search keyword.
        save_dir: Target directory; it is created when missing.
    """
    try:
        os.makedirs(save_dir, exist_ok=True)
    except OSError as err:
        print('Cannot create save directory {}: {}'.format(save_dir, err))
        return

    t = 0  # number of pictures saved so far
    start_time = time.time()
    for pn in range(0, 510, 20):
        kv = {'word': k, 'tn': 'baiduimage', 'pn': pn}
        html = getHTML(SEARCH_URL, kv, HEADERS)
        if not html:
            continue
        # The loop variable must not reuse the name holding the search URL:
        # rebinding it made every page after the first request an image URL
        # instead of the search endpoint, so no further results were found.
        for img_url in OBJ_URL_PATTERN.findall(html):
            try:
                r = requests.get(img_url, headers=HEADERS, timeout=10)
                # Do not store HTTP error pages as pictures.
                r.raise_for_status()
                target = os.path.join(save_dir, str(t + 1) + '.jpg')
                with open(target, 'wb') as f:
                    f.write(r.content)
            except (requests.RequestException, OSError, ValueError):
                # ValueError covers malformed URLs rejected before sending.
                print('爬取图片失败')
                continue
            # Count only after a successful write so file numbers stay gapless.
            t += 1
            print('爬取第{}个图片成功'.format(t))
    end_time = time.time()
    print('Total time:{}s'.format(end_time - start_time))


def main():
    """Ask for the search keyword on stdin and start the crawl."""
    k = input('Please input keywords of the pictures you wanna get:')
    parse(k)


if __name__ == '__main__':
    main()