"""Download gallery images from Toutiao search results.

The script pages through the Toutiao search API (three pages of 20 results),
opens every result that carries an ``article_url``, extracts the gallery JSON
embedded in the article page and saves each gallery image under
``downloads/``.
"""
import json
import os
import re
from urllib import request

import requests

# NOTE(review): mysql_conn is not used anywhere in this script; the import is
# kept because it may be relied on for import-time side effects -- confirm.
from day3.mysql_text import mysql_conn

# Browser-like User-Agent sent with every requests call.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
}

# Seconds to wait for a server before giving up on a request.
request_timeout = 10

# Matches `gallery: JSON.parse("<escaped json>"),` in the article page.
# The dot and the parentheses of the JavaScript call are escaped so that the
# capture group holds only the argument (a JSON-encoded string literal),
# not the surrounding parentheses.
gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Create the output directory once, up front; exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs('downloads', exist_ok=True)

for i in range(0, 60, 20):
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(i)
    print(url)

    response = requests.get(url, headers=headers, timeout=request_timeout)
    html_json_dict = response.json()

    # The list of search results lives under the 'data' key; treat a missing
    # or null value as an empty page instead of crashing.
    data_list = html_json_dict.get('data') or []

    # Only entries that carry an 'article_url' point at an article page.
    for data_item in data_list:
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        try:
            html_str = requests.get(
                article_url, headers=headers, timeout=request_timeout
            ).text
        except requests.RequestException as err:
            # One unreachable article should not abort the whole crawl.
            print('skipping {}: {}'.format(article_url, err))
            continue

        match_res = gallery_pattern.search(html_str)
        if not match_res:
            # Not a gallery article; nothing to download.
            continue

        try:
            # First pass decodes the JavaScript string literal into a JSON
            # string, second pass parses that JSON text.
            gallery = json.loads(json.loads(match_res.group(1)))
        except (ValueError, TypeError) as err:
            print('skipping {}: bad gallery JSON ({})'.format(article_url, err))
            continue
        if not isinstance(gallery, dict):
            continue

        for image in gallery.get('sub_images') or []:
            image_url = image['url']

            filename = 'downloads/' + image_url.split('/')[-1] + '.jpg'
            print(filename)
            try:
                request.urlretrieve(image_url, filename)
            except (OSError, ValueError) as err:
                # URLError/HTTPError derive from OSError; ValueError covers
                # URLs urllib cannot open (e.g. a missing scheme).
                print('failed to download {}: {}'.format(image_url, err))