zoukankan html css js c++ java

QQ空间Python爬虫（3）---终章

经测试上一节的代码成功跑通，接下来加上循环爬取所有说说-。-

完整代码：

 1 import requests
 2 import json
 3 import os
 4 import shutil
 5 import time
 6 
 7 qq = 627911861
 8 
 9 headers = {
10     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
11     'accept-encoding': 'gzip, deflate, br',
12     'accept-language': 'zh-CN,zh;q=0.8',
13     'cache-control': 'max-age=0',
14     'cookie': 'xxxxxx',
15     'upgrade-insecure-requests': '1',
16     'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Mobile Safari/537.36'
17 }
18 
19 url_x = 'https://mobile.qzone.qq.com/list?qzonetoken=9d29961d6fbb88be6236636010e0d4fde43a5b77d57ef984938b5aa0cb695e28c258a4d86b8c02a545bbcce970ff&g_tk=1573033187&res_attach=att%3D'
20 url_y = '%26tl%3D1508257557&format=json&list_type=shuoshuo&action=0&res_uin=627911861&count=40'
21 numbers = 0      # ‘查看更多’翻页
22 img_set = set()  # 存放图片url集
23 word_count = 0   # 文字说说计数器
24 words = ""       # 存放文字说说
25 images = ""      # 存放图片url
26 page = int(1761 / 40)
27 
28 
29 for i in range(0, page):
30     try:
31         html = requests.get(url_x + str(numbers) + url_y, headers=headers).content
32         data = json.loads(html)
33         # print(data)
34 
35         for vFeed in data['data']['vFeeds']:
36             if 'pic' in vFeed:
37                 for pic in vFeed['pic']['picdata']['pic']:
38                     img_set.add(pic['photourl']['0']['url'])
39 
40             if 'summary' in vFeed:
41                 # print(str(word_count) + '. ' + vFeed['summary']['summary'])
42                 words += str(word_count) + '. ' + vFeed['summary']['summary'] + '
'
43                 word_count += 1
44     except:
45         print('error')
46 
47     numbers += 40
48     time.sleep(10)
49 
50 try:
51     with open(os.getcwd() + '\' + str(qq) + '.txt', 'wb') as fo:
52         fo.write(words.encode('utf-8'))
53         print("文字说说写入完毕")
54 
55     with open(os.getcwd() + '\' + 'images_url', 'wb') as foImg:
56         for imgUrl in img_set:
57             images += imgUrl + '
'
58         foImg.write(images.encode('utf-8'))
59         print("图片写入完毕")
60 
61 except:
62     print('写入数据出错')
63 
64 
65 if not img_set:
66     print(u'不存在图片说说')
67 else:
68     image_path = os.getcwd() + 'images'
69     if os.path.exists(image_path) is False:
70         os.mkdir(image_path)
71     x = 1
72     for imgUrl in img_set:
73         temp = image_path + '/%s.jpg' % x
74         print(u'正在下载地%s张图片' % x)
75         try:
76             r = requests.get(imgUrl, stream=True)
77             if r.status_code == 200:
78                 with open(temp, 'wb') as f:
79                     r.raw.decode_content = True
80                     shutil.copyfileobj(r.raw, f)
81         except:
82             print(u'该图片下载失败:%s' % imgUrl)
83         x += 1

查看全文

相关阅读:
软件工程第二次作业
 软件工程第一次作业
 5T-时延小结
 4T--5G网络时延
 2T--网络切片
 1T--5G 三大应用场景
 2020软件工程第一次作业
 软件工程最后一次作业
 软件工程第四次作业
 软件工程第三次作业

原文地址：https://www.cnblogs.com/neilshi/p/7920738.html