zoukankan html css js c++ java

Python 爬虫 (四)

requests: 练手雪qiu网

 1 import requests
 2 import json
 3 import re
 4 import pymysql
 5 url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
 6 headers = {
 7     'Cookie': 'aliyungf_tc=AQAAALoQF3p02gsAUhVFebQ3uBBNZn+H; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; _ga=GA1.2.516718356.1534295265; _gid=GA1.2.1050085592.1534295265; u=301534295266356; device_id=f5c21e143ce8060c74a2de7cbcddf0b8; Hm_lvt_1db88642e346389874251b5a1eded6e3=1534295265,1534295722; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534295722',
 8     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
 9 }
10 res = requests.get(url, headers=headers) #get请求,将url和头文件一并传入
11 response = res.content.decode('utf-8')  # content 和 request下的.read()作用是一样的
12 response = json.loads(response)#分析第一个页面 从第一个页面中提取下一个Ajax请求所需要的id
13 while True:
14     if response['next_id'] != None:
15         url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id={}&count=15&category=-1'
16         fullurl = url.format(response['next_id']) #通过上面获取的id将需要浏览的下一个网页的url进行补全
17         res = requests.get(fullurl, headers=headers)
18         response = res.content.decode('utf-8')
19         response = json.loads(response) #response是个字典
20         # print(response)
21         ans_text = response['list']
22         # print(type(ans_text)) #list
23         for i in ans_text:
24             #i是字典
25             rul  = re.compile(r'"topic_title":"(.*?)",') #通过正则将需要的数据提取出来
26             rul2 = re.compile(r'"description":"(.*?)",')
27             ans1 = rul.findall(i['data'])
28             ans2 = rul2.findall(i['data'])
29             print(ans1)  #type 为list
30             print(ans2)
31             print('-' * 50)
32             
33             #将数据存到数据库中
34             connection = pymysql.connect(host='localhost',user='root',password='1234',db='xq')
35         try:
36             with connection.cursor() as cursor:
37                 # Create a new record
38                 sql = "INSERT INTO `xq_info` (`title`, `info`) VALUES (%s, %s)"
39                 cursor.execute(sql, (ans1[0],ans2[0]))
40             connection.commit()
41         finally:
42             connection.close()
43         print('&'*50)
44     else:
45         break

查看全文

相关阅读:
MySQL数据库封装和分页查询
 程序员的价值在哪里？
奇葩的程序员
 京东咚咚架构演进
 程序员必看的《黑客帝国》，你看懂了吗？
微信后台技术“干货们”带来的启发
 drf框架 2 drf框架的请求生命周期(as_view和dispatch方法), 请求、解析、渲染、响应、异常, 序列化组件，ORM配置回顾(media文件配置)，应用在settings.py中INSTALLED_APPS注册意义，数据库配置
 drf框架, 接口(api) Django FBV => CBV drf框架的基础试图类 drf核心组件群查与单查 python换源
 前端Vue框架 05 第三方插件(vuex: 组件间交互的(移动端), axios
前端Vue框架 04 路由：逻辑跳转、路由传参项目组件的数据局部化处理data(){ return{} } 组件的生命周期钩子组件间通信全局配置css, js

原文地址：https://www.cnblogs.com/pantom0122/p/9484708.html

Python 爬虫 (四)

requests: 练手 雪qiu网

requests: 练手雪qiu网