zoukankan      html  css  js  c++  java
  • Python: requests模块

    最近在看B站上的视频学习资料,此文是关于requests模块的一些使用实例。

    import requests

    # Shared UA header: many sites reject obviously scripted clients, so we
    # impersonate a desktop Chrome browser (UA spoofing). Reused by every
    # example below.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }

    # 1. Fetch the Sogou homepage and save its HTML source.
    # Fix: the original URL 'https://www.sougou.com/' misspelled the domain
    # (the real site is sogou.com, as example 2 below uses), and the request
    # did not send the spoofed UA header defined above.
    url = 'https://www.sogou.com/'
    response = requests.get(url=url, headers=headers)
    with open('./sougou.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print('download successfully')
    
    # 2. A minimal web "collector": query Sogou for a user-supplied keyword
    # and save the result page as <keyword>.html.
    word = input('enter a key word:')
    url = 'https://www.sogou.com/web'
    # Query-string parameters are passed as a dict via `params`.
    params = {
        'query': word
    }
    # UA spoofing: pretend to be a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    # The spoofed UA rides along in the request headers.
    response = requests.get(url=url, params=params, headers=headers)
    # Force the response encoding so Chinese text is not mojibake.
    response.encoding = 'utf-8'
    with open(word + '.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    print(word, '下载成功')
    
    # 3. Douban movies: the ranking page loads its data dynamically, so we
    # call the AJAX endpoint that serves the JSON directly instead of
    # scraping the HTML page.
    # Page: https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=
    url = 'https://movie.douban.com/j/chart/top_list'
    # Dynamic query parameters for the AJAX endpoint.
    params = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '0',   # offset of the first movie to fetch
        'limit': '20',  # number of movies per request
    }
    response = requests.get(url=url, headers=headers, params=params)
    page_text = response.json()  # deserialize the JSON payload into a list of dicts
    for dic in page_text:
        # Fix: the original `name + ':' + score` raises TypeError if the API
        # ever returns `score` as a number; an f-string formats either type
        # and produces identical output for strings.
        print(f"{dic['title']}:{dic['score']}")
    
    # 4. KFC store lookup: http://www.kfc.com.cn/kfccda/storelist/index.aspx
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    for page_num in range(1, 8):  # crawl result pages 1..7
        # `data` carries the POST form body -- the POST counterpart of the
        # `params` argument used with GET requests.
        data = {
            'cname': '',
            'pid': '',
            'keyword': '深圳',
            'pageIndex': str(page_num),  # pagination cursor
            'pageSize': '10',
        }
        page_text = requests.post(url, headers=headers, data=data).json()
        # Each entry in Table1 is one store record.
        for dic in page_text['Table1']:
            print(dic['addressDetail'])
    
    # 5. Drug-administration site (http://125.35.6.84:81/xk/): crawl the
    # paginated enterprise listing (pages 1..5), then fetch the detail
    # record for every enterprise ID found on each page.
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for page in range(1, 6):
        # POST form body selecting one listing page.
        form = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        page_text = requests.post(url, headers=headers, data=form).json()
        for dic in page_text['list']:
            print(dic['ID'])
            # NOTE(review): the listing endpoint is called with POST while
            # the detail endpoint is called with GET -- confirm the server
            # actually accepts GET here.
            detail_page_text = requests.get(
                detail_url, headers=headers, params={'id': dic['ID']}
            ).json()
            print(detail_page_text['businessPerson'] + ' ' + detail_page_text['epsAddress'])
    
    # 6. Downloading an image (binary data).
    # 6.1 requests -- supports UA spoofing via the headers argument.
    url = 'http://pics.sc.chinaz.com/files/pic/pic9/202004/zzpic24425.jpg'
    img_data = requests.get(url=url, headers=headers).content  # .content -> raw bytes
    with open('./request_img.png', 'wb') as f:
        f.write(img_data)
    # 6.2 urllib -- urlretrieve has no built-in way to spoof the UA.
    from urllib import request
    url = 'http://pics.sc.chinaz.com/files/pic/pic9/202004/zzpic24425.jpg'
    request.urlretrieve(url=url, filename='./urllib_img.png')
    
    # 7. Batch-download images from sc.chinaz.com. The page lazy-loads its
    # images, so the real address lives in the `src2` attribute:
    # <a target="_blank" href="..."><img src2="..._s.jpg" alt="..."></a>
    import re
    url = 'http://sc.chinaz.com/tupian/meinvtupian.html'
    page_text = requests.get(url=url, headers=headers).text  # .text -> body as str
    # Extract every src2 value; re.S lets `.` span newlines inside a tag.
    pattern = re.compile(r'<a.*?<img src2="(.*?)" alt.*?</a>', re.S)
    for img_id, src in enumerate(pattern.findall(page_text), start=1):
        # The scraped URL points at a thumbnail; dropping the '_s' suffix
        # yields the full-size original.
        full_src = src.replace('_s', '')
        img_data = requests.get(full_src, headers=headers).content
        with open('./' + str(img_id) + '.png', 'wb+') as f:
            f.write(img_data)
    
    # 8. Huawei (vmall) offline-store data: https://m.vmall.com/help/hnrstoreaddr.htm
    # Store list:   POST https://openapi.vmall.com/mcp/offlineshop/getShopList
    # Store detail: GET  https://openapi.vmall.com/mcp/offlineshop/getShopById
    url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'
    # Request body sent as JSON (`json=` posts application/json, unlike `data=`).
    data = {
         "portal":2,
         "lang":"zh-CN",
         "country":"CN",
         "brand":1,
         "province":"北京",
         "city":"北京",
         "pageNo":1,
         "pageSize":20
    }
    json_data_ids = requests.post(url=url, json=data, headers=headers).json()
    # Each entry in shopInfos already carries the full store record -- the
    # per-store detail request below returns the same information again.
    print(json_data_ids['shopInfos'])

    detail_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
    for dic in json_data_ids['shopInfos']:
        query = {
            'portal': '2',
            'version': '10',
            'country': 'CN',
            'shopId': dic['id'],
            'lang': 'zh-CN',
        }
        shop_detail = requests.get(url=detail_url, headers=headers, params=query).json()
        print(shop_detail)

    参考: (P1~P6)

  • 相关阅读:
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 15—Anomaly Detection异常检测
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 14—Dimensionality Reduction 降维
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 13—Clustering 聚类
    【原】Coursera—Andrew Ng机器学习—课程笔记 Lecture 12—Support Vector Machines 支持向量机
    【原】机器学习公开课 目录(课程笔记、测验习题答案、编程作业源码)...持续更新...
    【原】Coursera—Andrew Ng机器学习—Week 11 习题—Photo OCR
    【原】Coursera—Andrew Ng机器学习—Week 10 习题—大规模机器学习
    【原】Coursera—Andrew Ng机器学习—Week 9 习题—异常检测
    【原】Coursera—Andrew Ng机器学习—Week 8 习题—聚类 和 降维
    【原】Coursera—Andrew Ng机器学习—Week 7 习题—支持向量机SVM
  • 原文地址:https://www.cnblogs.com/danvy/p/12725828.html
Copyright © 2011-2022 走看看