这里用的是 json + re + requests + BeautifulSoup + 多进程（multiprocessing.Pool，并非多线程）
import hashlib
import json
import os
import re
from multiprocessing.pool import Pool

import requests
from bs4 import BeautifulSoup
from requests import RequestException

from config import *
9
10
def get_page_index(offset, keyword):
    """Fetch one page of the Toutiao search-index JSON.

    :param offset: pagination offset (multiples of 20)
    :param keyword: search term
    :return: response body text, or None on a non-200 status or any
             network failure
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/'
    try:
        # timeout: without it a stalled connection hangs this pool
        # worker forever (requests has no default timeout)
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
35
36
def parse_page_index(html):
    """Yield every article URL found in one search-index JSON page.

    :param html: raw JSON text from get_page_index(); may be None when
                 the fetch failed
    :yield: the 'article_url' of each entry (None for entries that
            lack one — the caller filters those out)
    """
    if not html:
        # upstream fetch failed or returned an empty body
        return
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        # the server sometimes answers with an HTML error page
        return
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')
43
44
def get_page_detail(url):
    """Fetch one article detail page.

    :param url: article URL taken from the search index
    :return: page HTML text, or None on a non-200 status or any
             network failure
    """
    # desktop UA so the server returns the full desktop markup that
    # parse_page_detail() expects
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    try:
        # timeout: a dead host must not hang the pool worker
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
58
59
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Downloads every image found as a side effect (via download_image).

    :param html: detail-page HTML
    :param url: the page's own URL, echoed back in the result
    :return: dict with 'title', 'url', 'images', or None when the page
             carries no parseable gallery data
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tag = soup.select_one('title')
    # guard: pages without a <title> previously left `title` unbound
    # and raised UnboundLocalError below
    title = title_tag.get_text() if title_tag else ''

    # The gallery JSON is embedded as:  gallery: JSON.parse("..."),
    # '.', '(' and ')' must be escaped — unescaped, '(' opened an
    # unintended capture group so group(1) included the quotes.
    pattern = re.compile(r'gallery: JSON\.parse\("(.*?)"\),', re.S)
    result = pattern.search(html)
    if not result:
        return None

    # The embedded JSON is backslash-escaped; strip the backslashes.
    # NOTE: the original re.sub('\\', ...) was an invalid regex (lone
    # backslash) and raised re.error at runtime — r'\\' is the fix.
    unescaped = re.sub(r'\\', '', result.group(1))
    try:
        data = json.loads(unescaped)
    except json.JSONDecodeError:
        return None
    if not data:
        return None

    images = [item.get('url') for item in data.get('sub_images', [])]
    for image in images:
        download_image(image, title)
    return {
        'title': title,
        'url': url,
        'images': images,
    }
86
87
def download_image(url, title):
    """Download one image and save it via save_to_image().

    :param url: image URL to fetch
    :param title: page title, forwarded to save_to_image() for naming
    :return: None (download is best-effort; failures are skipped)
    """
    print('正在下载', url)
    try:
        # timeout: keep a dead image host from blocking the pool worker
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            save_to_image(response.content, title)
        return None
    except RequestException:
        return None
103
104
# Running counter appended to saved file names.
# NOTE(review): under multiprocessing.Pool each worker process gets its
# own copy of this global, so two workers can produce the same
# title+count name — confirm whether overwrites are acceptable.
count = 0
106
107
def save_to_image(content, title):
    """Persist one downloaded image under ./头条/.

    :param content: raw image bytes
    :param title: page title; kept for interface compatibility but no
                  longer used in the file name (titles may contain
                  path separators, and the old title+counter scheme
                  collided across pool processes, each of which had
                  its own copy of the global counter)
    :return: None
    """
    # create the target directory on first use instead of failing with
    # FileNotFoundError on a fresh checkout
    os.makedirs('./头条', exist_ok=True)
    # content-hash naming is unique across processes and de-duplicates
    # identical images for free
    file_path = './头条/{}.{}'.format(hashlib.md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
120
121
def main(offset):
    """Scrape every gallery reachable from one search-index page.

    :param offset: search-index offset handed in by the process pool
    """
    html = get_page_index(offset, '街拍')
    if not html:
        # index fetch failed; the original passed None straight into
        # json.loads via parse_page_index and crashed with TypeError
        return
    for url in parse_page_index(html):
        if not url:
            # entries without an 'article_url' yield None
            continue
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        if result:
            print(result)
138
139
# First and last (exclusive) page groups to scrape; each group is 20 results.
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    offsets = [i * 20 for i in range(GROUP_START, GROUP_END)]
    # context manager closes and joins the pool, so the parent process
    # cannot exit while workers are still mid-download (the original
    # never called close()/join())
    with Pool() as pool:
        pool.map(main, offsets)