  • Python: the requests module

    I've recently been working through some video tutorials on Bilibili; this post collects a few usage examples of the requests module.

    import requests
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    
    #1. Scrape the page source of the Sogou homepage
    url = 'https://www.sogou.com/'
    response = requests.get(url=url)
    with open('./sogou.html','w',encoding='utf-8') as f:
        f.write(response.text)
    print('downloaded successfully')
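    
    # (Added sketch, not part of the original tutorial.) A small robustness check:
    # raise_for_status() turns HTTP error codes (4xx/5xx) into exceptions instead
    # of silently saving an error page to disk.
    response = requests.get(url=url, headers=headers)
    response.raise_for_status()  # raises requests.HTTPError on a bad status code
    print(response.status_code, len(response.text))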
    
    #2. A simple web page collector
    word = input('enter a keyword:')
    url = 'https://www.sogou.com/web'
    params = {
        'query': word
    }
    # UA spoofing: make the request look like it comes from a normal browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    # Attach the spoofed UA to the request via the headers argument
    response = requests.get(url=url, params=params, headers=headers)
    response.encoding = 'utf-8' # manually set the response encoding to fix garbled characters
    page_text = response.text
    filename = word + '.html'
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(page_text)
    print(word, 'downloaded successfully')
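    
    # (Added sketch, not part of the original tutorial.) Hard-coding 'utf-8' works
    # for Sogou, but when the server doesn't declare a charset, requests can guess
    # one from the response body via apparent_encoding:
    response = requests.get(url=url, params=params, headers=headers)
    response.encoding = response.apparent_encoding  # charset detected from the bytes
    print(response.encoding)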
    
    # 3. Goal: scrape detailed movie data from Douban (dynamically loaded data)
    # url:https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=
    url = 'https://movie.douban.com/j/chart/top_list'
    # Build the query parameters dynamically
    params = {
        'type': '24',
        'interval_id': '100:90',
        'action':'',
        'start': '0',
        'limit': '20',
    }
    response = requests.get(url=url,headers=headers,params=params)
    page_text = response.json() # .json() deserializes the JSON response into Python objects
    # print(page_text)
    for dic in page_text:
        name = dic['title']
        score = dic['score']
        print(name + ':' + score)
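    
    # (Added sketch, not part of the original tutorial.) The endpoint pages through
    # results via 'start'/'limit', so the full list can be walked in a loop; stopping
    # on an empty batch is my assumption about how the API signals the end.
    start = 0
    while True:
        params['start'] = str(start)
        batch = requests.get(url=url, headers=headers, params=params).json()
        if not batch:
            break  # no more movies in this rating interval
        for dic in batch:
            print(dic['title'] + ':' + dic['score'])
        start += len(batch)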
    
    # 4. Scrape the KFC restaurant finder: http://www.kfc.com.cn/kfccda/storelist/index.aspx
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    for page_num in range(1,8):
        data = {
            'cname':'',
            'pid': '',
            'keyword': '深圳',
        'pageIndex': str(page_num), # page number, for pagination
            'pageSize': '10',
        }
    # The data argument carries the POST form parameters, analogous to params in the get method
        response = requests.post(url, headers=headers, data=data)
        page_text = response.json()
    #     print(page_text)
        for dic in page_text['Table1']:
            pos = dic['addressDetail']
            print(pos)
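    
    # (Added sketch, not part of the original tutorial.) Collecting the results into
    # a list and pausing between requests keeps the scraper polite:
    import time
    all_addresses = []
    for page_num in range(1, 8):
        data['pageIndex'] = str(page_num)
        page_text = requests.post(url, headers=headers, data=data).json()
        all_addresses.extend(d['addressDetail'] for d in page_text['Table1'])
        time.sleep(0.5)  # throttle so the server isn't hammered
    print(len(all_addresses), 'addresses collected')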
    
    # 5. Scrape company details from the drug administration (NMPA) site: http://125.35.6.84:81/xk/
    # Detail info for each company:
    # Capture multiple pages of data
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    for page in range(1,6):
        data = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName':'' ,
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        response = requests.post(url, headers=headers, data=data)
        page_text = response.json()
        # print(page_text)
        for dic in page_text['list']:
            print(dic['ID'])
            detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
            detail_data = {
                'id': dic['ID'],
            }
            detail_response = requests.get(detail_url, headers=headers, params=detail_data)
            detail_page_text = detail_response.json()
            print(detail_page_text['businessPerson'] + ' ' + detail_page_text['epsAddress'])
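    
    # (Added sketch, not part of the original tutorial.) A defensive variant of the
    # detail lookup: dict.get() supplies a fallback so one incomplete record doesn't
    # abort the whole run with a KeyError.
    def fetch_company_detail(company_id):
        detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
        detail = requests.get(detail_url, headers=headers, params={'id': company_id}).json()
        return detail.get('businessPerson', 'N/A'), detail.get('epsAddress', 'N/A')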
    
    # 6. Scraping images
    # 6.1 requests - supports UA spoofing
    url = 'http://pics.sc.chinaz.com/files/pic/pic9/202004/zzpic24425.jpg'
    response = requests.get(url=url, headers=headers)
    img_data = response.content # .content returns the response body as bytes
    with open('./request_img.jpg', 'wb') as f:
        f.write(img_data)
    # 6.2 urllib - urlretrieve takes no headers argument, so no UA spoofing this way
    from urllib import request
    url = 'http://pics.sc.chinaz.com/files/pic/pic9/202004/zzpic24425.jpg'
    request.urlretrieve(url=url, filename='./urllib_img.jpg')
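    
    # (Added sketch, not part of the original tutorial.) urlretrieve itself accepts
    # no headers, but plain urllib can still send a custom UA via urllib.request.Request:
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as resp, open('./urllib_img_ua.jpg', 'wb') as f:
        f.write(resp.read())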
    
    # 7. Batch-scrape images from the Chinaz stock-image site
    # <a target="_blank" href="http://sc.chinaz.com/tupian/200313466825.htm" alt="坐在沙发上看书的美女图片">
    # <img src2="http://pic2.sc.chinaz.com/Files/pic/pic9/202003/hpic2172_s.jpg" alt="坐在沙发上看书的美女图片">
    # </a>
    import re
    url = 'http://sc.chinaz.com/tupian/meinvtupian.html'
    page_text = requests.get(url=url, headers=headers).text # .text returns the response data as a string
    # Parse out the image URLs with a regular expression
    ex = '<a.*?<img src2="(.*?)" alt.*?</a>'
    img_src_list = re.findall(ex, page_text, re.S) # re.S lets '.' match newlines too
    # print(img_src_list)
    img_id = 0
    for img in img_src_list:
        img_id += 1
        img = img.replace('_s','') # the scraped URLs point to thumbnails; dropping '_s' gives the full-size image
        img_data = requests.get(img, headers=headers).content
        with open('./' + str(img_id) + '.jpg', 'wb') as f:
            f.write(img_data)
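    
    # (Added sketch, not part of the original tutorial.) Deriving the file extension
    # from the URL instead of hard-coding it keeps the saved filenames honest:
    import os
    for img_id, img in enumerate(img_src_list, start=1):
        img = img.replace('_s', '')
        ext = os.path.splitext(img)[1] or '.jpg'  # fall back to .jpg if the URL has no suffix
        with open('./{}{}'.format(img_id, ext), 'wb') as f:
            f.write(requests.get(img, headers=headers).content)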
    
    # 8. Scrape detailed store data: https://m.vmall.com/help/hnrstoreaddr.htm
    # Single-store info: https://openapi.vmall.com/mcp/offlineshop/getShopById + params    GET
    # Store ID list:     https://openapi.vmall.com/mcp/offlineshop/getShopList    POST
    url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'
    data = {
         "portal":2,
         "lang":"zh-CN",
         "country":"CN",
         "brand":1,
         "province":"北京",
         "city":"北京",
         "pageNo":1,
         "pageSize":20
    }
    json_data_ids = requests.post(url=url, json=data, headers=headers).json()
    # print(json_data_ids)
    print(json_data_ids['shopInfos'])  # this list already contains each store's full info - the same data the per-store detail requests below return
    
    for dic in json_data_ids['shopInfos']:
        _id = dic['id']
        detail_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
        params = {
            'portal': '2',
            'version': '10',
            'country': 'CN',
            'shopId': _id,
            'lang': 'zh-CN',
        }
        shop_detail = requests.get(url=detail_url,headers=headers, params=params).json()
        print(shop_detail)
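    
    # (Added sketch, not part of the original tutorial.) When firing many requests
    # at the same host, a requests.Session reuses the TCP connection and applies
    # the headers once instead of per call:
    session = requests.Session()
    session.headers.update(headers)  # UA now sent with every request on this session
    for dic in json_data_ids['shopInfos']:
        params['shopId'] = dic['id']
        shop_detail = session.get(detail_url, params=params).json()
        print(shop_detail)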

    Reference: the Bilibili tutorial videos mentioned above (P1~P6)

  • Original post: https://www.cnblogs.com/danvy/p/12725828.html