zoukankan      html  css  js  c++  java
  • 爬虫之requests模块

    安装requests模块

    pip install requests

    requests模块的常用方法, 属性, 参数

    import requests

    # Basic GET request. Original signature: get(url, params=None, **kwargs)
    ret = requests.get(url='https://www.baidu.com', )
    ret.encoding = 'utf-8'  # encoding used when decoding the response body
    print(ret.content)  # response body as raw bytes
    print(ret.text)  # response body decoded to str
    print(ret.url)  # the URL that was actually requested
    print(ret.headers, type(ret.headers))  # response headers; <class 'requests.structures.CaseInsensitiveDict'>, behaves like a dict
    print(ret.json())  # parse the body as JSON (use when the response Content-Type is json)


    params = {  # query-string parameters carried in the GET URL
        "keyword": "O98K",
    }
    header = {  # request header fields
        "name": "SATH"
    }
    # BUG FIX: the keyword argument is `headers=`, not `header=`.
    # The original `header=header` raised TypeError (unexpected keyword argument).
    ret = requests.get(url='http://www.baidu.com', params=params, headers=header)
    data = {  # form parameters carried by a POST request
       "name": "sath"
    }

    爬虫案例一: 爬取搜狗指定词条搜索后的页面数据

    import requests

    # Inspecting Sogou's search traffic shows the keyword is submitted as a
    # GET request to https://www.sogou.com/web with the `query` parameter.
    url = "https://www.sogou.com/web"
    params = {
        "query": "apple"
    }

    response = requests.get(url=url, params=params)

    # Save the result page locally so it can be opened in a browser.
    with open('./sogou.html', 'w', encoding='utf-8') as page_file:
        page_file.write(response.text)

    爬虫案例二: 爬取豆瓣电影分类排行榜中的电影详情数据

    import requests
    from multiprocessing import Pool
    import time

    url = 'https://movie.douban.com/j/new_search_subjects'
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
    }
    movie_title_list = []


    def get_movie(start):
        """Fetch one page (20 items) of comedy movies at offset `start`.

        Prints each title as it is found and returns the list of titles
        (empty when the response is not the expected JSON).
        """
        params = {
            "sort": "U",
            "tags": "",
            "start": start,
            "genres": "喜剧",
        }
        titles = []
        ret = requests.get(url=url, params=params, headers=header)
        if ret.headers['Content-Type'] == "application/json; charset=utf-8":
            data = ret.json()["data"]
            for movie in data:
                titles.append(movie["title"])
                print(movie["title"])
        # BUG FIX: the original appended to the module-level movie_title_list
        # inside Pool workers; each child process mutates its own copy, so the
        # parent's list stayed empty. Return the titles and let the main
        # process collect them instead.
        return titles


    if __name__ == '__main__':
        p = Pool(20)
        start = time.time()
        results = [p.apply_async(get_movie, args=(n,)) for n in range(0, 10000, 20)]
        p.close()
        p.join()
        for res in results:
            # .get() transfers the worker's return value (and re-raises any
            # exception that occurred in the worker).
            movie_title_list.extend(res.get())
        print(time.time() - start)
        # ~14s, acceptable

    爬虫案例三: 爬取肯德基餐厅查询中指定地点的餐厅数据  

    import requests
    import json

    # KFC's store-locator endpoint: a POST whose form body carries the search
    # criteria while the operation name travels in the query string.
    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
    }
    # Form fields: city name / province id left blank, search by keyword,
    # first page, ten results per page.
    data = dict(
        cname="",
        pid="",
        keyword="邯郸",
        pageIndex="1",
        pageSize="10",
    )
    resp = requests.post(url=url, headers=header, data=data, params={"op": "keyword"})
    store_info = json.loads(resp.text)
    print(store_info, type(store_info))

    爬虫案例四: 药监局信息爬取

    import requests
    from multiprocessing import Pool

    # Listing endpoint: each POST returns one JSON page of licence records.
    url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
    }
    ids = []
    # NOTE(review): this crawl runs at import time; under the "spawn" start
    # method (Windows/macOS) every Pool worker re-imports the module and would
    # repeat all 230 requests — consider moving it under the __main__ guard.
    for page in range(20, 250):
        data = {
            "on": "true",
            "page": page,
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        ret = requests.post(url=url, headers=header, data=data)
        # Non-JSON responses (error pages) are skipped.
        if ret.headers['Content-Type'] == "application/json;charset=UTF-8":
            res = ret.json()["list"]
            for n in res:
                ids.append(n['ID'])
    # Detail endpoint: a POST with a record id returns the full record.
    url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'


    def func(k):
        """Fetch the detail record for licence id `k` and print its businessPerson."""
        data = {
            "id": k
        }
        r = requests.post(url=url, headers=header, data=data)
        if r.headers['Content-Type'] == "application/json;charset=UTF-8":
            print(r.json()["businessPerson"])


    if __name__ == '__main__':
        p = Pool(14)
        # BUG FIX: the original called p.apply_async(func, k). `args` must be a
        # tuple, so a multi-character id string was unpacked into one positional
        # argument per character and every worker call failed — silently, since
        # the AsyncResult was never inspected. Pass args=(k,) and call .get()
        # so worker exceptions surface.
        results = [p.apply_async(func, args=(k,)) for k in ids]
        p.close()
        p.join()
        for s in results:
            s.get()
  • 相关阅读:
    理解字节序(转)
    《逆向分析实战》数据的存储及表示形式
    C语言——内存分配
    C语言编程基础学习字符型数据的ASCII码值为何是负数?
    你知道嵌入式C语言中各变量存储的位置吗?
    stm32入门(从51过渡到32)
    说说M451例程讲解之LED
    说说M451的例程库的说明
    STM32总线结构和存储器
    STM32学习之路入门篇之指令集及cortex——m3的存储系统
  • 原文地址:https://www.cnblogs.com/594504110python/p/10066128.html
Copyright © 2011-2022 走看看