zoukankan      html  css  js  c++  java
  • 二 . 爬虫 requests模块使用 urllib模块 和 请求响应相关参数

    一 . requests模块使用 和 请求响应相关参数

    https://www.cnblogs.com/wupeiqi/articles/6283017.html

    1. requests  get请求相关参数

    import requests

    # Target URL; the query string after "?" already carries one parameter.
    url = 'http://httpbin.org/get?name=bob'

    # Extra query parameters. They are merged with those already on the URL
    # (no precedence); a key present in both shows up server-side as a list.
    params = {'name': 'nick', 'age': '18'}

    # Cookies passed separately. If the headers also carry a Cookie entry,
    # the header value is the one actually sent.
    cookies = {'xxx': '111', 'yyy': '222'}

    headers = {
        # Without a browser User-Agent the request is detected as a Python client.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        "Cookie": 'aaa=aaa;bbb=bbb'
    }

    # Proxy mapping in the fixed scheme->address format; use an "https" key
    # when the target URL is https.
    proxies = {'http':'http://ip:端口'}

    timeout = 0.5           # seconds; the request raises if it takes longer
    allow_redirects = True  # whether redirects are followed automatically

    res = requests.get(
        url=url,
        headers=headers,
        params=params,
        cookies=cookies,
        timeout=timeout,
        allow_redirects=allow_redirects,
    )
    print(res.text)
    
    
    {
      "args": {
        "age": "18", 
        "name": [
          "bob", 
          "nick"
        ]
      }, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Cookie": "aaa=aaa;bbb=bbb", 
        "Host": "httpbin.org", 
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
      }, 
      "origin": "117.172.254.245, 117.172.254.245", 
      "url": "https://httpbin.org/get?name=bob&name=nick&age=18"
    }

    2. requests  post请求相关参数

    import requests

    # POST demo against httpbin: sends form fields, a JSON payload and a file.
    url = 'http://httpbin.org/post'

    data = {
        'name': 'nick',  # form-encoded body fields
        'age': '18',
    }
    # JSON payload; when `data` is also supplied, the server reports "json": null
    # because the body can only be encoded one way. Renamed from `json`, which
    # shadowed the builtin module name.
    json_payload = {"sex":'man'}

    cookies = {
        'xxx': 'xxx',
        'yyy': 'yyy'
    }
    headers = {
        # Browser User-Agent so the request is not flagged as a Python client.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
        "Cookie": 'aaa=aaa;bbb=bbb'
    }
    timeout = 0.5           # seconds before the request raises
    allow_redirects = True  # whether redirects are followed

    # Open the uploaded file in a context manager so the handle is closed
    # (the original leaked it), and actually pass timeout/allow_redirects,
    # which were defined but never forwarded to the request before.
    with open('aa', 'rt', encoding='utf8') as fh:
        files = {'file': fh}
        res = requests.post(
            url=url,
            headers=headers,
            data=data,
            cookies=cookies,
            json=json_payload,
            files=files,
            timeout=timeout,
            allow_redirects=allow_redirects,
        )
    print(res.text)
    
    
    
    {
      "args": {}, 
      "data": "", 
      "files": {
        "file": "1111111111111111111111111111u5a03u5a03u8ba4u4e3au4eba"
      }, 
      "form": {
        "age": "18", 
        "name": "nick"
      }, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Content-Length": "356", 
        "Content-Type": "multipart/form-data; boundary=e4ee34734e2325fdc6fa1eb84d070882", 
        "Cookie": "aaa=aaa;bbb=bbb", 
        "Host": "httpbin.org", 
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
      }, 
      "json": null, 
      "origin": "117.172.254.245, 117.172.254.245", 
      "url": "https://httpbin.org/post"
    }

    3.  request请求

    import requests
    requests.request(method,url,**kwargs)  # method为请求方式,url请求地址。get,post请求本质上是继承了request请求

    4. 响应相关参数

    import requests
    r = requests.get(....)
    
    r.url  # 请求的url
    
    r.text  # 获得响应体文本信息
    
    r.encoding = 'gbk'  # 设置编码方式,用来解决乱码
    
    r.content  # 二进制信息
    
    r.json()  # 相当于json.loads(r.text),若返回的不是json数据,会报错(注意:json是方法,必须加括号调用)
    
    r.status_code  # 响应状态码
    
    r.headers  # 响应头
    
    r.cookies  # 拿cookie
    
    r.history  # 有重定向时,取到的是 [响应对象1,响应对象2...]

    5. 自动保存cookie的请求

    session = requests.session()   
    r = session.get(...)  # 会将cookie保存在session中,下次发请求时会带上cookie
    
    # Extra: persist session cookies to a local file between runs.
    import http.cookiejar as cookiejar
    import requests

    session = requests.session()
    # Swap in a cookie jar that supports save()/load() in LWP format.
    session.cookies = cookiejar.LWPCookieJar()

    # Load previously saved cookies. On the very first run the file does not
    # exist yet; the original code crashed here instead of starting fresh.
    try:
        session.cookies.load(filename='cookie.txt')
    except FileNotFoundError:
        pass  # first run: no saved cookies yet

    res = session.get('http://www.baidu.com')
    session.cookies.save(filename='cookie.txt')  # persist cookies for next run





    import requests

    # A Session carries cookies across the three requests below:
    # first visit -> log in -> vote while authenticated.
    session = requests.Session()

    # Step 1: plain GET so the site sets its initial cookies on the session.
    resp_visit = session.get(url="http://dig.chouti.com/help/service")

    # Step 2: log in; the auth cookie is stored on the session automatically.
    login_payload = {
        'phone': "8615131255089",
        'password': "xxooxxoo",
        'oneMonth': ""
    }
    resp_login = session.post(url="http://dig.chouti.com/login", data=login_payload)

    # Step 3: vote using the now-authenticated session.
    resp_vote = session.post(url="http://dig.chouti.com/link/vote?linksId=8589523")
    print(resp_vote.text)
    # 1. 方法
        requests.get
        requests.post 
        requests.put 
        requests.delete 
        ...
        requests.request(method='POST')
    
    # 2. 参数
    
        2.1  url
        2.2  headers
        2.3  cookies
        2.4  params
        2.5  data,传请求体
                
                requests.post(
                    ...,
                    data={'user':'alex','pwd':'123'}
                )
                
                GET /index http1.1
    host:c1.com
    
    user=alex&pwd=123
                
        2.6  json,传请求体
                requests.post(
                    ...,
                    json={'user':'alex','pwd':'123'}
                )
                
                GET /index http1.1
    host:c1.com
    Content-Type:application/json
    
    {"user":"alex","pwd":123}
        2.7 代理 proxies
            # 无验证
                proxie_dict = {
                    "http": "61.172.249.96:80",
                    "https": "http://61.185.219.126:3128",
                }
                ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxie_dict)
                
            
            # 验证代理
                from requests.auth import HTTPProxyAuth
                
                proxyDict = {
                    'http': '77.75.105.165',
                    'https': '77.75.106.165'
                }
                auth = HTTPProxyAuth('用户名', '密码')
                
                r = requests.get("http://www.google.com", data={'xxx':'ffff'}, proxies=proxyDict, auth=auth)
                print(r.text)
        -----------------------------------------------------------------------------------------
        2.8 文件上传 files
            # 发送文件
                file_dict = {
                    'f1': open('xxxx.log', 'rb')
                }
                requests.request(
                    method='POST',
                    url='http://127.0.0.1:8000/test/',
                    files=file_dict
                )
                
        2.9 认证 auth
        
            内部:
                用户名和密码,用户和密码加密,放在请求头中传给后台。
                
                    - "用户:密码"
                    - base64("用户:密码")
                    - "Basic base64("用户:密码")"
                    - 请求头:
                        Authorization: "Basic base64("用户:密码")"
                
            from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    
            ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
            print(ret.text)
            
        2.10 超时 timeout 
            # ret = requests.get('http://google.com/', timeout=1)
            # print(ret)
        
            # ret = requests.get('http://google.com/', timeout=(5, 1))
            # print(ret)
            
        2.11 允许重定向  allow_redirects
            ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
            print(ret.text)
            
        2.12 大文件下载 stream
            from contextlib import closing
            with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
                # 在此处理响应。
                for i in r1.iter_content():
                    print(i)
                
        2.13 证书 cert
            - 百度、腾讯 => 不用携带证书(系统帮你做了)
            - 自定义证书
                requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
                requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem","xxx.xxx.xx.key"))
        2.14 确认 verify =False 
    
    
    
    requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")

    二 .urllib模块使用 和 请求响应相关参数

    https://www.cnblogs.com/meipu/p/11181754.html

    https://www.cnblogs.com/zhangxinqi/p/9170312.html

    其主要包括以下模块:
    
    urllib.request 请求模块
    
    urllib.error 异常处理模块
    
    urllib.parse url解析模块
    
    urllib.robotparser robots.txt解析模块
    import urllib.request

    # Basic urllib GET: urlopen returns a response object whose read() gives bytes.
    response = urllib.request.urlopen('https://www.python.org')
    body = response.read().decode('utf-8')
    print(body)

    # Response metadata: status code, the full header list, one header by name.
    print(response.status)
    print(response.getheaders())
    print(response.getheader('Server'))


    print("#####################################################################################3333")
    
    
    
    from urllib import request, parse

    # POST with urllib: the body must be url-encoded and passed as bytes.
    url = 'http://httpbin.org/post'
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Host': 'httpbin.org'
    }
    # Renamed from `dict`, which shadowed the builtin of the same name.
    form_fields = {
        'name': 'Germey'
    }
    data = bytes(parse.urlencode(form_fields), encoding='utf8')
    req = request.Request(url=url, data=data, headers=headers, method='POST')
    response = request.urlopen(req)
    print(response.read().decode('utf-8'))
  • 相关阅读:
    20000字干货笔记,一天搞定Mysql~【转】
    Linux操作系统概述及内核介绍
    如何在装有高版本NBU的主机上安装低版本的NBU?卸载8.0安装7.5记录
    vmware+kvm+vnc安装配置
    NBU异机恢复Oracle数据库,作业报错2850处理
    NetBackup 进程整理
    1、虚拟化实施流程、宿主机如何选型、如何进行性能测试
    灾难恢复的衡量指标RTO和RPO
    国内主要灾备厂商
    单例设计模式
  • 原文地址:https://www.cnblogs.com/lovershowtime/p/11771338.html
Copyright © 2011-2022 走看看