最近在看B站上的视频学习资料,此文是关于requests模块相关的一些使用实例。
import requests
# Browser-like User-Agent sent with the requests below (UA spoofing).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}

# 1. Download the HTML source of the Sogou home page.
# Use the same host spelling (sogou.com) as the search request further down.
url = 'https://www.sogou.com/'
# Send the User-Agent defined above, and bound the wait so a stalled
# connection cannot hang the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of saving an error page and reporting success.
response.raise_for_status()
with open('./sougou.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
print('download successfully')
# 2. Simple web-page collector: save the Sogou result page for a keyword.
word = input('enter a key word:')
url = 'https://www.sogou.com/web'
# The keyword is sent as the `query` URL parameter.
params = {
    'query': word,
}
# UA spoofing: present a browser User-Agent instead of the library default.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}
# Attach the spoofed User-Agent to the request headers; the timeout keeps a
# stalled connection from blocking the script indefinitely.
response = requests.get(url=url, params=params, headers=headers, timeout=10)
# Stop on 4xx/5xx rather than saving an error page as if it were the result.
response.raise_for_status()
# Override the response encoding by hand to avoid garbled text.
response.encoding = 'utf-8'
page_text = response.text
# The keyword is free-form user input: replace path separators and other
# characters that are illegal in file names so the output is always a plain
# file in the current directory.
safe_word = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in word)
filename = safe_word + '.html'
with open(filename, 'w', encoding='utf-8') as f:
    f.write(page_text)
print(word, '下载成功')
# 3. Goal: scrape movie details from Douban (dynamically loaded data).
# Page URL: https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=
url = 'https://movie.douban.com/j/chart/top_list'
# Query parameters kept in a dict so they can be varied without editing the URL.
params = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '0',
    'limit': '20',
}
# `headers` is the User-Agent dict defined earlier in this script.
# The timeout keeps a stalled connection from hanging the script.
response = requests.get(url=url, headers=headers, params=params, timeout=10)
# Surface HTTP errors directly instead of failing later inside JSON decoding.
response.raise_for_status()
page_text = response.json()  # json() returns the already-deserialized object
# print(page_text)
# NOTE(review): assumes the payload is a list of dicts with 'title' and
# 'score' keys -- confirm against the live API response.
for dic in page_text:
    name = dic['title']
    score = dic['score']
    # sep=':' gives the same "title:score" output as string concatenation,
    # but does not raise TypeError if a value is not a str.
    print(name, score, sep=':')
# 4. 抓取KFC餐厅查询:http://www.kfc.com.cn/kfccda/storelist/index.aspx
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
for page_num in range(1,8):
data = {
'cname':'',
'pid': '',
'keyword': '深圳'