zoukankan      html  css  js  c++  java
  • Python: requests模块

    最近在看B站上的视频学习资料,此文是关于requests模块的一些使用实例。

    import requests

    # Shared UA header: many sites reject obviously scripted clients, so we
    # impersonate a desktop Chrome browser (UA spoofing). Reused by every
    # example below.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }

    # 1. Fetch the Sogou homepage and save its HTML source.
    # Fix: the original URL 'https://www.sougou.com/' misspelled the domain
    # (the real site is sogou.com, as example 2 below uses), and the request
    # did not send the spoofed UA header defined above.
    url = 'https://www.sogou.com/'
    response = requests.get(url=url, headers=headers)
    with open('./sougou.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print('download successfully')
    
    # 2. A minimal web "collector": query Sogou for a user-supplied keyword
    # and save the result page as <keyword>.html.
    word = input('enter a key word:')
    url = 'https://www.sogou.com/web'
    # Query-string parameters are passed as a dict via `params`.
    params = {
        'query': word
    }
    # UA spoofing: pretend to be a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    # The spoofed UA rides along in the request headers.
    response = requests.get(url=url, params=params, headers=headers)
    # Force the response encoding so Chinese text is not mojibake.
    response.encoding = 'utf-8'
    with open(word + '.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print(word, '下载成功')
    
    # 3. Douban movies: the ranking page loads its data dynamically, so we
    # call the AJAX endpoint that serves the JSON directly instead of
    # scraping the HTML page.
    # Page: https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=
    url = 'https://movie.douban.com/j/chart/top_list'
    # Dynamic query parameters for the AJAX endpoint.
    params = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '0',   # offset of the first movie to fetch
        'limit': '20',  # number of movies per request
    }
    response = requests.get(url=url, headers=headers, params=params)
    page_text = response.json()  # deserialize the JSON payload into a list of dicts
    for dic in page_text:
        # Fix: the original `name + ':' + score` raises TypeError if the API
        # ever returns `score` as a number; an f-string formats either type
        # and produces identical output for strings.
        print(f"{dic['title']}:{dic['score']}")
    
    # 4. KFC store lookup: http://www.kfc.com.cn/kfccda/storelist/index.aspx
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    for page_num in range(1, 8):  # crawl result pages 1..7
        # `data` carries the POST form body -- the POST counterpart of the
        # `params` argument used with GET requests.
        data = {
            'cname': '',
            'pid': '',
            'keyword': '深圳',
            'pageIndex': str(page_num),  # pagination cursor
            'pageSize': '10',
        }
        page_text = requests.post(url, headers=headers, data=data).json()
        # Each entry in Table1 is one store record.
        for dic in page_text['Table1']:
            print(dic['addressDetail'])
    
    # 5. Drug-administration site (http://125.35.6.84:81/xk/): crawl the
    # paginated enterprise listing (pages 1..5), then fetch the detail
    # record for every enterprise ID found on each page.
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for page in range(1, 6):
        # POST form body selecting one listing page.
        form = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        page_text = requests.post(url, headers=headers, data=form).json()
        for dic in page_text['list']:
            print(dic['ID'])
            # NOTE(review): the listing endpoint is called with POST while
            # the detail endpoint is called with GET -- confirm the server
            # actually accepts GET here.
            detail_page_text = requests.get(
                detail_url, headers=headers, params={'id': dic['ID']}
            ).json()
            print(detail_page_text['businessPerson'] + ' ' + detail_page_text['epsAddress'])
    
    # 6. Downloading an image (binary data).
    # 6.1 requests -- supports UA spoofing via the headers argument.
    url = 'http://pics.sc.chinaz.com/files/pic/pic9/202004/zzpic24425.jpg'
    img_data = requests.get(url=url, headers=headers).content  # .content -> raw bytes
    with open('./request_img.png', 'wb') as f:
        f.write(img_data)
    # 6.2 urllib -- urlretrieve has no built-in way to spoof the UA.
    from urllib import request
    url = 'http://pics.sc.chinaz.com/files/pic/pic9/202004/zzpic24425.jpg'
    request.urlretrieve(url=url, filename='./urllib_img.png')
    
    # 7. Batch-download images from sc.chinaz.com. The page lazy-loads its
    # images, so the real address lives in the `src2` attribute:
    # <a target="_blank" href="..."><img src2="..._s.jpg" alt="..."></a>
    import re
    url = 'http://sc.chinaz.com/tupian/meinvtupian.html'
    page_text = requests.get(url=url, headers=headers).text  # .text -> body as str
    # Extract every src2 value; re.S lets `.` span newlines inside a tag.
    pattern = re.compile(r'<a.*?<img src2="(.*?)" alt.*?</a>', re.S)
    for img_id, src in enumerate(pattern.findall(page_text), start=1):
        # The scraped URL points at a thumbnail; dropping the '_s' suffix
        # yields the full-size original.
        full_src = src.replace('_s', '')
        img_data = requests.get(full_src, headers=headers).content
        with open('./' + str(img_id) + '.png', 'wb+') as f:
            f.write(img_data)
    
    # 8. Huawei (vmall) offline-store data: https://m.vmall.com/help/hnrstoreaddr.htm
    # Store list:   POST https://openapi.vmall.com/mcp/offlineshop/getShopList
    # Store detail: GET  https://openapi.vmall.com/mcp/offlineshop/getShopById
    url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'
    # Request body sent as JSON (`json=` posts application/json, unlike `data=`).
    data = {
         "portal":2,
         "lang":"zh-CN",
         "country":"CN",
         "brand":1,
         "province":"北京",
         "city":"北京",
         "pageNo":1,
         "pageSize":20
    }
    json_data_ids = requests.post(url=url, json=data, headers=headers).json()
    # Each entry in shopInfos already carries the full store record -- the
    # per-store detail request below returns the same information again.
    print(json_data_ids['shopInfos'])

    detail_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
    for dic in json_data_ids['shopInfos']:
        query = {
            'portal': '2',
            'version': '10',
            'country': 'CN',
            'shopId': dic['id'],
            'lang': 'zh-CN',
        }
        shop_detail = requests.get(url=detail_url, headers=headers, params=query).json()
        print(shop_detail)

    参考: (P1~P6)

  • 相关阅读:
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 15—Anomaly Detection异常检测
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 14—Dimensionality Reduction 降维
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 13—Clustering 聚类
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 12—Support Vector Machines 支持向量机
    【原】机器学习公开课 目录(课程笔记、测验习题答案、编程作业源码)...持续更新...
    【原】Coursera—Andrew Ng机器学习—Week 11 习题—Photo OCR
    【原】Coursera—Andrew Ng机器学习—Week 10 习题—大规模机器学习
    【原】Coursera—Andrew Ng机器学习—Week 9 习题—异常检测
    【原】Coursera—Andrew Ng机器学习—Week 8 习题—聚类 和 降维
    【原】Coursera—Andrew Ng机器学习—Week 7 习题—支持向量机SVM
  • 原文地址:https://www.cnblogs.com/danvy/p/12725828.html
Copyright © 2011-2022 走看看