今日内容:
- 爬取豌豆荚
爬取豌豆荚:
1.访问游戏主页
https://www.wandoujia.com/category/6001
2.点击查看更多,观察network内的请求
- 请求url
page2:
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=vbw9lj1sRQsRddx0hD-XqCNF
page3:
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=vbw9lj1sRQsRddx0hD-XqCNF
page4:
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=4&ctoken=vbw9lj1sRQsRddx0hD-XqCNF
3.循环拼接30个接口的url(page 从 1 到 30,其余参数不变)
4.解析返回的数据,获取每一个app数据
1 import requests
2 import re
3 from bs4 import BeautifulSoup
4
# 1. Send the request
def get_page(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response object.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response object. The HTTP status code is not
        checked here; callers decide how to handle error responses.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    return requests.get(url, timeout=timeout)
# 2. Parse the data
def parse_data(text):
    """Extract every app card from an HTML fragment and record it.

    Each ``<li class="card">`` element is turned into a small text report
    that is printed and appended to ``wdj.txt`` (UTF-8).

    Args:
        text: HTML markup containing the ``<li class="card">`` elements.

    Returns:
        None. The results are printed and appended to ``wdj.txt``.
    """
    soup = BeautifulSoup(text, 'html.parser')
    li_list = soup.find_all(name='li', attrs={'class': 'card'})
    if not li_list:
        # Nothing to record; do not create or touch the output file.
        return

    # The size is looked up via a <span> whose title looks like "123MB".
    # The backslash is required: a bare 'd+MB' matches a literal letter "d",
    # never digits, so the lookup would always fail.
    size_pattern = re.compile(r'\d+MB')

    # Open the output file once for the whole batch instead of once per card;
    # leaving the ``with`` block flushes and closes it.
    with open('wdj.txt', 'a', encoding='utf-8') as f:
        for li in li_list:
            # The same <a class="name"> tag carries both title and detail URL.
            name_tag = li.find(name='a', attrs={'class': 'name'})
            if name_tag is None:
                # A card without a name link has nothing useful to report.
                continue
            app_name = name_tag.text
            detail_url = name_tag.attrs.get('href')

            # Download count; fall back to an empty string when absent.
            count_tag = li.find(name='span', class_="install-count")
            download_num = count_tag.text if count_tag is not None else ''

            # App size; fall back to an empty string when absent.
            size_tag = li.find(name='span', attrs={"title": size_pattern})
            app_size = size_tag.text if size_tag is not None else ''

            app_data = (
                '\n'
                '❀================= 游戏信息 ==================❀\n'
                f'游戏名称:{app_name}\n'
                f'游戏地址:{detail_url}\n'
                f'下载人数:{download_num}\n'
                f'游戏大小:{app_size}\n'
                '❀============= 游戏信息加载完毕 ==============❀\n'
            )
            print(app_data)
            f.write(app_data)
if __name__ == '__main__':
    # NOTE(review): range(1, 2) yields only page 1, although the notes at the
    # top of this file describe looping over 30 pages -- confirm the intended
    # upper bound before widening it.
    for page in range(1, 2):
        url = f'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=P5D2WskeLvknBIa'
        print(url)
        # Fetch the page and decode the JSON body.
        payload = get_page(url).json()
        # The card markup is read from the nested data -> content entry
        # and handed to the parser.
        parse_data(payload.get('data').get('content'))