源码:
import requests
import json
import re
import os
from urllib import request

# Seconds to wait on an HTTP request before giving up, so that a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Characters that cannot appear in a Windows file name, plus path separators
# and control whitespace; each one is replaced with '_' in directory names.
_UNSAFE_PATH_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# Page title, used as the per-gallery directory name.
_TITLE_PATTERN = re.compile(r'<title>(.*?)</title>')

# Argument of the `gallery: JSON.parse(...)` call embedded in the page.
# The parentheses of the call must be escaped; left unescaped they are regex
# groups and the capture is always the empty string.
_GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\((.*?)\),')


def _safe_name(title):
    """Return *title* made usable as a single directory name."""
    cleaned = _UNSAFE_PATH_CHARS.sub('_', title).strip()
    return cleaned or 'untitled'


# Fetch gallery article links
def get_urls(offset, headers):
    """Return the article URLs listed on one page of search results.

    Args:
        offset: Value substituted into the ``offset`` query parameter.
        headers: HTTP headers sent with the request.

    Returns:
        A list with the ``article_url`` value of every result entry that has
        one; empty when the response carries no ``data`` entries.

    Raises:
        requests.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3&from=gallery'.format(offset)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Guard against a missing or null 'data' field instead of raising.
    entries = response.json().get('data') or []
    return [entry['article_url'] for entry in entries if 'article_url' in entry]


# Download pictures
def download_pictures(url, headers):
    """Download every image of the gallery page at *url*.

    Images are saved as ``街拍图/<page title>/<last URL segment>.jpg``; files
    that already exist are not downloaded again.

    This is best-effort: a page that cannot be fetched or parsed, or an image
    that cannot be downloaded, is reported with ``print`` and skipped, so the
    caller's crawl continues. Nothing is returned.

    Args:
        url: Address of the gallery article page.
        headers: HTTP headers sent with the page request.
    """
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('skipped {}: request failed: {}'.format(url, exc))
        return
    print(url)
    html = response.text

    title_match = _TITLE_PATTERN.search(html)
    if title_match is None:
        print('skipped {}: no <title> found'.format(url))
        return
    title = title_match.group(1)
    print(title)

    # Regex match for the image addresses
    gallery_match = _GALLERY_PATTERN.search(html)
    if gallery_match is None:
        print('skipped {}: no gallery data found'.format(url))
        return
    try:
        # The captured text is decoded twice, as in the original code:
        # presumably it is a quoted string literal whose content is itself
        # JSON (first loads -> str, second loads -> dict).
        gallery = json.loads(json.loads(gallery_match.group(1)))
        images = gallery['sub_images']
    except (ValueError, KeyError, TypeError) as exc:
        print('skipped {}: cannot parse gallery data: {!r}'.format(url, exc))
        return

    dir_name = '街拍图/' + _safe_name(title)
    try:
        os.makedirs(dir_name, exist_ok=True)
    except OSError as exc:
        print('skipped {}: cannot create {}: {}'.format(url, dir_name, exc))
        return

    for image in images:
        image_url = image.get('url') if isinstance(image, dict) else None
        if not image_url:
            continue
        filename = dir_name + '/' + image_url.split('/')[-1] + '.jpg'
        if os.path.exists(filename):
            continue
        print('正在下载:' + filename)
        try:
            request.urlretrieve(image_url, filename)
        except (OSError, ValueError) as exc:
            # urllib's URLError/HTTPError derive from OSError; a malformed
            # URL raises ValueError. Keep going with the remaining images.
            print('failed {}: {}'.format(image_url, exc))


if __name__ == '__main__':
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # Three result pages: offsets 0, 20 and 40.
    for offset in range(0, 60, 20):
        url_list = get_urls(offset, headers)
        for url in url_list:
            download_pictures(url, headers)