zoukankan      html  css  js  c++  java
  • Python Requests

    笔记-requests详解(二)

    实例引入

    import requests
     
    response = requests.get('https://www.baidu.com/')
    print(type(response))
    print(response.status_code)
    print(type(response.text))
    print(response.text)
    print(response.cookies)

    各种请求方式

    import requests
    requests.post('http://httpbin.org/post')
    requests.put('http://httpbin.org/put')
    requests.delete('http://httpbin.org/delete')
    requests.head('http://httpbin.org/get')
    requests.options('http://httpbin.org/get')

    请求

    基本GET请求

    基本写法

    import requests
     
    response = requests.get('http://httpbin.org/get')
    print(response.text)

    带参数GET请求

    import requests
    response = requests.get("http://httpbin.org/get?name=germey&age=22")
    print(response.text)
    import requests
     
    data = {
        'name': 'germey',
        'age': 22
    }
    response = requests.get("http://httpbin.org/get", params=data)
    print(response.text)

    解析json

    import requests
    import json
     
    response = requests.get("http://httpbin.org/get")
    print(type(response.text))
    print(response.json())
    print(json.loads(response.text))
    print(type(response.json()))

    获取二进制数据

    import requests
     
    response = requests.get("https://github.com/favicon.ico")
    print(type(response.text), type(response.content))
    print(response.text)
    print(response.content)
    import requests
     
    response = requests.get("https://github.com/favicon.ico")
    with open('favicon.ico', 'wb') as f:
        f.write(response.content)
        f.close()

    添加headers

    import requests
     
    response = requests.get("https://www.zhihu.com/explore")
    print(response.text)
    import requests
     
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    response = requests.get("https://www.zhihu.com/explore", headers=headers)
    print(response.text)

    基本POST请求

    import requests
     
    data = {'name': 'germey', 'age': '22'}
    response = requests.post("http://httpbin.org/post", data=data)
    print(response.text)
    import requests
     
    data = {'name': 'germey', 'age': '22'}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    }
    response = requests.post("http://httpbin.org/post", data=data, headers=headers)
    print(response.json())

    响应

    reponse属性

    import requests
     
    response = requests.get('http://www.jianshu.com')
    print(type(response.status_code), response.status_code)
    print(type(response.headers), response.headers)
    print(type(response.cookies), response.cookies)
    print(type(response.url), response.url)
    print(type(response.history), response.history)
    import requests
     
    response = requests.get('http://www.jianshu.com/hello.html')
    exit() if not response.status_code == requests.codes.not_found else print('404 Not Found')
    import requests
     
    response = requests.get('http://www.jianshu.com')
    exit() if not response.status_code == 200 else print('Request Successfully')

    高级操作

    文件上传

    import requests
     
    files = {'file': open('favicon.ico', 'rb')}
    response = requests.post("http://httpbin.org/post", files=files)
    print(response.text)

    获取cookie

    import requests
     
    response = requests.get("https://www.baidu.com")
    print(response.cookies)
    for key, value in response.cookies.items():
        print(key + '=' + value)

    会话维持

    模拟登录

    import requests
     
    requests.get('http://httpbin.org/cookies/set/number/123456789')
    response = requests.get('http://httpbin.org/cookies')
    print(response.text)
    import requests
     
    s = requests.Session()
    s.get('http://httpbin.org/cookies/set/number/123456789')
    response = s.get('http://httpbin.org/cookies')
    print(response.text)
    import requests
    s = requests.session()
     
    data = {'form_email':'账户','form_password':'密码'}
    #post 换成登录的地址,
    s.post('https://www.douban.com/accounts/login',data)
    #换成抓取的地址
    response=s.get('https://movie.douban.com/top250')
    print(response.text)#显示以下信息,说明已登陆成功

    证书验证

    import requests
     
    response = requests.get('https://www.12306.cn')
    print(response.status_code)
    import requests
    from requests.packages import urllib3
    urllib3.disable_warnings()
    response = requests.get('https://www.12306.cn', verify=False)
    print(response.status_code)
    import requests
     
    response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
    print(response.status_code)

    代理设置

    import requests
     
    proxies = {
      "http": "http://127.0.0.1:9743",
      "https": "https://127.0.0.1:9743",
    }
     
    response = requests.get("https://www.taobao.com", proxies=proxies)
    print(response.status_code)
    import requests
     
    proxies = {
        "http": "http://user:password@127.0.0.1:9743/",
    }
    response = requests.get("https://www.taobao.com", proxies=proxies)
    print(response.status_code)

    超时设置

    import requests
    from requests.exceptions import ReadTimeout
    try:
        response = requests.get("http://httpbin.org/get", timeout = 0.5)
        print(response.status_code)
    except ReadTimeout:
        print('Timeout')

    认证设置

    import requests
    from requests.auth import HTTPBasicAuth
     
    r = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))
    print(r.status_code)
    import requests
     
    r = requests.get('http://120.27.34.24:9001', auth=('user', '123'))
    print(r.status_code)

    异常处理

    import requests
    from requests.exceptions import ReadTimeout, ConnectionError, RequestException
    try:
        response = requests.get("http://httpbin.org/get", timeout = 0.5)# timeout 仅对连接过程有效,与响应体的下载无关。# timeout 并不是整个下载响应的时间限制,而是如果服务器在15 秒内没有应答,将会引发一个异常(更精确地说,是在 timeout 秒内没有从基础套接字上接收到任何字节的数据时)
        print(response.status_code)
    except ReadTimeout:
        print('Timeout')
    except ConnectionError:
        print('Connection error')
    except RequestException:
        print('Error')

    json入参嵌套

    方式一:

     
    import requests,json
     
    url = "http://xxx"
    headers = {"Content-Type":"application/json","Authorization":"Bearer token值"}
    data1 = {"key1":"value1","key2":"value2"}
    data = {"params":data1}
    
    r = request.post(url = url,data = json.dumps(data),headers = headers)
     

    方式二:

     
    import requests,json
     
    url = "http://xxx"
    headers = {"Content-Type":"application/json","Authorization":"Bearer token值"}
    data1 = {"key1":"value1","key2":"value2"}
    data = {"params":data1}
    
    r = request.post(url = url,json = data,headers = headers)
     

    Python Requests 小技巧总结

    1:重定向
    网络请求中可能会遇到重定向,我们需要一次处理一个请求,可以把重定向禁止。

    self.session.post(url,data,allow_redirects=False)

    2:写接口请求,debug时,会需要看下代码请求的详细信息,当然我们可以使用fiddler来查看,其实我们自己也可以在代码这样获取debug信息*

    import requests
    import logging
    import httplib as http_client
    
    http_client.HTTPConnection.debuglevel = 1
    
    logging.basicConfig()
    logging.getLogger().setLevel(logging.DEBUG)
    requests_log = logging.getLogger("requests.packages.urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True
    
    requests.get('https://www.baidu.com')
    
    #更好的方法是自己封装一个装饰器,就可以为任意请求函数添加一个debug功能。

    3:统计一个API请求花费的时间,我们可以使用如下方法

    self.session.get(url).elapsed 

    4:发送自定义cookies
    我们使用Session实例来保持请求之间的cookies,但是有些特殊情况,需要使用自定义的cookies
    我们可以这样

    # 自定义cookies
    cookie = {'guid':'5BF0FAB4-A7CF-463E-8C17-C1576fc7a9a8','uuid':'3ff5f4091f35a467'}
    
    session.post('http://wikipedia.org', cookies=cookie) 
    
    

    python中requests.session的妙用

     

    在进行接口测试的时候,我们会调用多个接口发出多个请求,在这些请求中有时候需要保持一些共用的数据,例如cookies信息。

    1、requests库的session对象能够帮我们跨请求保持某些参数,也会在同一个session实例发出的所有请求之间保持cookies。

    s = requests.session()
    # req_param = '{"belongId": "300001312","userName": "alitestss003","password":"pxkj88","captcha":"pxpx","captchaKey":"59675w1v8kdbpxv"}'
    # res = s.post('http://test.e.fanxiaojian.cn/metis-in-web/auth/login', json=json.loads(req_param))
    # # res1 = s.get("http://test.e.fanxiaojian.cn/eos--web/analysis/briefing")
    # print(res.cookies.values())   获取登陆的所有session

    2、requests库的session对象还能为我们提供请求方法的缺省数据,通过设置session对象的属性来实现
    eg:

    # 创建一个session对象  

    s = requests.Session()  

    # 设置session对象的auth属性,用来作为请求的默认参数  

    s.auth = ('user', 'pass')  

    # 设置session的headers属性,通过update方法,将其余请求方法中的headers属性合并起来作为最终的请求方法的headers  

    s.headers.update({'x-test': 'true'})  

    # 发送请求,这里没有设置auth会默认使用session对象的auth属性,这里的headers属性会与session对象的headers属性合并  

    r = s.get('http://httpbin.org/headers', headers={'x-test2': 'true'})  

    上面的请求数据等于:{'Authorization': 'Basic dXNlcjpwYXNz', 'x-test': 'false'}

    # 查看发送请求的请求头  

    r.request.headers      #打印响应中请求的所有header数据

    res3 = s.get("http://pre.n.cn/irs-web/sso/login",cookies = cookie)
    print(res3.request.headers.get("Cookie").split("IRSSID=")[-1])
    print(type(res3.request.headers.get("Cookie").split("IRSSID=")[-1]))
    print(res3.request._cookies)
    
    

    Python:requests:详解超时和重试

     

    网络请求不可避免会遇上请求超时的情况,在 requests 中,如果不设置你的程序可能会永远失去响应。
    超时又可分为连接超时和读取超时。

    连接超时

    连接超时指的是在你的客户端实现到远端机器端口的连接时(对应的是connect()),Request 等待的秒数。
    import time
    import requests
    
    url = 'http://www.google.com.hk'
    
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    try:
        html = requests.get(url, timeout=5).text
        print('success')
    except requests.exceptions.RequestException as e:
        print(e)
    
    print(time.strftime('%Y-%m-%d %H:%M:%S'))

    因为 google 被墙了,所以无法连接,错误信息显示 connect timeout(连接超时)。

    2019-10-25 15:03:40
    HTTPConnectionPool(host='www.google.com.hk', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x02DC4650>, 'Connection to www.google.com.hk timed out. (connect timeout=5)'))
    2019-10-25 15:03:46

    就算不设置,也会有一个默认的连接超时时间(我测试了下,大概是21秒)。

    读取超时

    读取超时指的就是客户端等待服务器发送请求的时间。(特定地,它指的是客户端要等待服务器发送字节之间的时间。在 99.9% 的情况下这指的是服务器发送第一个字节之前的时间)。

    简单的说,连接超时就是发起请求连接到连接建立之间的最大时长,读取超时就是连接成功开始到服务器返回响应之间等待的最大时长。

    如果你设置了一个单一的值作为 timeout,如下所示:

    r = requests.get('https://github.com', timeout=5)

    这一 timeout 值将会用作 connect 和 read 二者的 timeout。如果要分别制定,就传入一个元组:

    r = requests.get('https://github.com', timeout=(3.05, 27))
    

    黑板课爬虫闯关的第四关正好网站人为设置了一个15秒的响应等待时间,拿来做说明最好不过了。 

    #!/usr/bin/python
    #coding=utf-8
    import time
    import requests

    url_login = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03/'

    session = requests.Session()
    session.get(url_login)

    token = session.cookies['csrftoken']
    session.post(url_login, data={'csrfmiddlewaretoken': token, 'username': 'xx', 'password': 'xx'})

    print(time.strftime('%Y-%m-%d %H:%M:%S'))

    url_pw = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/'
    try:
    html = session.get(url_pw, timeout=(5, 10)).text
    #html = session.get(url_pw, timeout=(5, 1)).text
    print('success')
    except requests.exceptions.RequestException as e:
    print(e)

    print(time.strftime('%Y-%m-%d %H:%M:%S'))

    错误信息中显示的是 read timeout(读取超时)。

    2019-10-25 15:07:46
    HTTPConnectionPool(host='www.heibanke.com', port=80): Read timed out. (read timeout=1)
    2019-10-25 15:07:48

    读取超时是没有默认值的,如果不设置,程序将一直处于等待状态。我们的爬虫经常卡死又没有任何的报错信息,原因就在这里了。

    超时重试

    一般超时我们不会立即返回,而会设置一个三次重连的机制。

    def gethtml(url):
        i = 0
        while i < 3:
            try:
                html = requests.get(url, timeout=5).text
                return html
            except requests.exceptions.RequestException:
                i += 1

    其实 requests 已经帮我们封装好了。(但是代码好像变多了...)

    import time
    import requests
    from requests.adapters import HTTPAdapter
    
    s = requests.Session()
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    try:
        r = s.get('http://www.google.com.hk', timeout=5)
        print r.text
    except requests.exceptions.RequestException as e:
        print(e)
    print(time.strftime('%Y-%m-%d %H:%M:%S'))

    max_retries 为最大重试次数,重试3次,加上最初的一次请求,一共是4次,所以上述代码运行耗时是20秒而不是15秒

    2019-10-25 15:12:36
    HTTPConnectionPool(host='www.google.com.hk', port=80): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x02DA4B50>, 'Connection to www.google.com.hk timed out. (connect timeout=5)'))
    2019-10-25 15:12:56

    用GREQUESTS实现并发HTTP请求

    起因

    要用http请求探测服务的有效性,多进程,多线程,感觉似乎没有必要,看看有没有协程的方案

    1. 简单用法

    grequests 利用 requests和gevent库,做了一个简单封装,使用起来非常方便

    import grequests
    import time
    import requests
    urls = [
        'https://docs.python.org/2.7/library/index.html',
        'https://docs.python.org/2.7/library/dl.html',
        'http://www.iciba.com/partial',
        'http://2489843.blog.51cto.com/2479843/1407808',
        'http://blog.csdn.net/woshiaotian/article/details/61027814',
        'https://docs.python.org/2.7/library/unix.html',
        'http://2489843.blog.51cto.com/2479843/1386820',
        'http://www.bazhuayu.com/tutorial/extract_loop_url.aspx?t=0',
    ]
    
    def method1():
        t1 = time.time()
        for url in urls:
            res = requests.get(url)
            #print res.status_code
    
        t2 = time.time()
        print 'method1', t2 - t1
    
    def method2():
        tasks = [grequests.get(u) for u in urls]
        t1 = time.time()
        res = grequests.map(tasks, size=3)
    #    print res
        t2 = time.time()
        print 'method2', t2 - t1
    
    def method3():
        tasks = [grequests.get(u) for u in urls]
        t1 = time.time()
        res = grequests.map(tasks, size=6)
    #    print res
        t2 = time.time()
    
    if __name__ == '__main__':
        method1()
        method2()
        method3()

    运行结果如下:

    method1 8.51106119156
    method2 5.77834510803
    method3 2.55373907089

    可以看到使用协程以后,整个程序的完成时间有了大大缩短,并且每个协程的并发粒度也会影响整体时间

    2. 重要参数

    这里需要补充的是几个
    grequests

    def grequests.map(requests, stream=False, size=None, exception_handler=None, gtimeout=None)

     另外,由于grequests底层使用的是requests,因此它支持
    GET,OPTIONS, HEAD, POST, PUT, DELETE 等各种http method
    所以以下的任务请求都是支持的

    grequests.post(url, json={"name":"zhangsan"}) 
    grequests.delete(url) 

    3. 事件钩子

    grequests的底层库,是requests,因此它也支持事件钩子

    def print_url(r, *args, **kwargs):
            print(r.url)
    
    url = "http://www.baidu.com"
    # 1.
    res = requests.get(url, hooks={"response":print_url})
    
    # 2.
    tasks = []
    req = grequests.get(url, callback=print_url)
    tasks.append(req)
    res = grequests.map(tasks)

  • 相关阅读:
    try,except,finally的用法
    python实现蓝牙通信
    分布式全局ID的几种生成方案
    为什么要两次调用encodeURI来解决乱码问题
    jenkins配置到gitlab拉代码
    查看IOS-app证书到期时间
    使用SSH方式实现Git远程连接GitHub/gitlab
    Git 分支
    jenkins构建后操作archive the artfacts的用法
    MAC 安装jenkins
  • 原文地址:https://www.cnblogs.com/leijiangtao/p/11738038.html
Copyright © 2011-2022 走看看