zoukankan      html  css  js  c++  java
  • 爬虫之requests模块的基础使用等相关内容-133

    1 爬虫基本原理

    1 百度是个大爬虫
    2 模拟浏览器发送http请求---(请求库)(频率,cookie,浏览器头。。js反爬,app逆向)(抓包工具)-----》从服务器取回数据-----》解析数据--(解析库)(反爬)----》入库(存储库)
    3 爬虫协议:robots.txt(网站在根目录下声明哪些路径允许或禁止爬取,属于约定而非技术限制)

    2 requests模块

    0 urllib 内置库,发送http请求,比较难用;requests 的底层基于 urllib3(第三方库,并非内置的 urllib)
    1 requests,应用非常广泛的请求库
    2 requests-html库(对requests、bs4、lxml等的二次封装)

    3 User-Agent:请求头中标志是什么客户端发送的请求
    4 Referer:上次请求的地址

     

    2.1 发送get请求

    1 携带数据,携带头,携带cookie。。。
    import requests
    # res=requests.get('https://www.cnblogs.com/')
    # # print(res.text) # 文本内容
    # print(res.content) # 二进制内容 视频,图片 res.iter_content()


    ## 携带参数
    # header={
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    # }
    #https://www.sogou.com/web?query=%E7%87%95%E7%AA%9D
    # # res=requests.get('https://www.baidu.com/s?wd=帅哥',headers=header)
    # res=requests.get('https://www.baidu.com/s',headers=header,params={'wd':'老奶奶'})
    #
    # with open('baidu.html','wb') as f:
    #     f.write(res.content)
    # # print(res.text)


    #### url编码和解码
    from urllib.parse import urlencode,unquote
    # wd='燕窝'
    # encode_res=urlencode({'k':wd},encoding='utf-8')
    # print(encode_res)

    # Percent-decode a UTF-8 encoded query value back into readable text.
    encoded_query = '%E7%87%95%E7%AA%9D'
    res = unquote(encoded_query)
    print(res)



    ### 携带头
    '''
    Host :
    Referer :大型网站通常都会根据该参数判断请求的来源
    User-Agent: 客户端
    Cookie :Cookie信息虽然包含在请求头里,但requests模块有单独的参数来处理他,headers={}内就不要放它了
    # cookie携带的两种方式,如果是字符串形式,就放在请求头中
    如果是字典,cookiejar的对象,就放在参数中cookies=字典,cookiejar的对象
    '''

    # header={
    #     'Cookie':'BIDUPSID=BAA91359911D4514C92ABD22B88961AE; PSTM=1601026753; MCITY=-289%3A; BD_UPN=12314753; sug=3; sugstore=0; ORIGIN=0; bdime=0; H_WISE_SIDS=164576_164894_161282_165717_163389_156287_163806_159613_162897_155226_165552_151532_165328_159530_159937_162434_160878_161125_164299_164987_164692_127969_156929_164163_160246_165292_164941_163979_165236_164621_160276_131423_164148_128701_165203_165424_161568_107312_164812_163412_165375_160574_161965_161674_163270_144966_164259_162186_154213_161237_158640_160980_164131_161885_162268_161771_164451_162643_165471_162156_110085_162020_163569_163567_164960_165074_163274_164128_165144_165155_165647_165711; __yjs_duid=1_5cf7a7f565d2f9cacef83d808d8eb0c01610527765679; BAIDUID=EA290B9A7A371E91762170324A4ED8C0:FG=1; BDSFRCVID=R60OJexroG3VeHne34XvvrEMkvqMFyTTDYLEJs2qYShnrsPVJeC6EG0PtoWQkz--EHtdogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=JJkO_D_atKvjDbTnMITHh-F-5fIX5-RLfbreBh7F54nKDp0R3xRi0tQyhJJZ-Jj4BC3MahvKQDOxsMTsQqOr34tn2foyKt6lQe_JLqTN3KJmfKn1bUbA5DrXWxvM2-biWbRM2MbdJqvP_IoG2Mn8M4bb3qOpBtQmJeTxoUJ25DnJhhCGe6t5D5J0jN-s-bbfHDJK0b7aHJOoDDv40MQcy4LbKxnxJhKLtIo-oKbob4QKsq6cbURvD--g3-OkWUQ7Q2QCBI3mtJ3KsxQTyb3HQfbQ0hOyKfjxX5ILaKbH2R7JOpkxbUnxy5KUQRPH-Rv92DQMVU52QqcqEIQHQT3m5-5bbN3ut6IDfKbKWbvqajrDHJTg5DTjhPrM0GDjWMT-MTryKKOC5ln_eD8mWbj-KxLIMpjDWx58QNnRhlRNWt8KqfTbM-TKKp8ZyxomtfQxtNRJWM3l2-FVKq5S5-OobUPUDMJ9LUkqW2cdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj2CKLK-oj-D89ejub3j; ab_sr=1.0.0_MmE0YjM2NDEzZTYyMDc3MzcyOGRmY2FlZTUwODAxYmU0MGZkNTQxMGZjYmIzNGMyYTQwOTY1OGRlMGZjYWI3MjMwYjIxMWM0NGIzNDljZWNmYWI1NDljNmE3YjIyN2Nm; H_PS_PSSID=33423_33440_33344_33284_33286_33395_33463_33414_26350; H_PS_645EC=72dfrtzn7c42D9bDsI6pLpQGO5zBgUgpsueKbELHa2yRddAFDA%2FhwY0FWkg; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598'
    # }
    # res=requests.get('https://www.baidu.com/',cookies='',headers=header)

    2.2 发送post请求

    1 携带数据,携带头,携带cookie
    import requests


    ## post请求携带数据
    # res=requests.post('地址',headers=字典,cookies=对象,params='放在链接中',
    #                   data='字典,放在请求体中',json='字典等可序列化对象,自动转成json字符串放到请求体中')

    ''''
    #如果我们自定义请求头是application/json,并且用data传值, 则服务端取不到值
    requests.post(url='',
                data={'':1,},
                headers={
                    'content-type':'application/json'
                })

    requests.post(url='',
                json={'':1,},
                ) #默认的请求头:application/json
    '''


    ## session对象,自动处理cookie,不需要人为处理cookie
    ## 一旦登录成功,以后,不需要手动携带cookie了,直接使用session对象发送请求,会自动携带


    # session=requests.session()
    #
    # data = {
    #     'username': '616564099@qq.com',
    #     'password': 'lqz123',
    #     'captcha': '123',
    #     'remember': 1,
    #     'ref': 'http://www.aa7a.cn/',
    #     'act': 'act_login',
    # }
    # header = {
    #     'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout',
    #     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    # }
    # res = session.post('http://www.aa7a.cn/user.php', headers=header,data=data)
    #
    #
    # res=session.get('http://www.aa7a.cn/')
    # print('616564099@qq.com' in res.text)

    2.3 高级用法

    1 使用代理,上传文件,超时设置

     

    import requests
    ### SSL Cert Verification(了解即可)

    ## 方式一
    # respone=requests.get('https://www.12306.cn',verify=False) #不验证证书,报警告,返回200
    #
    # # 手动携带证书
    # import requests
    # respone=requests.get('https://www.12306.cn',
    #                     cert=('/path/server.crt',
    #                           '/path/key'))
    # print(respone.status_code)


    #超时设置
    # import requests
    # respone=requests.get('https://www.baidu.com',
    #                     timeout=0.01)
    # print(respone.text)

    # 认证设置(了解)
    # import requests
    # from requests.auth import HTTPBasicAuth
    # r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
    # print(r.status_code)


    # 异常处理
    # import requests
    # from requests.exceptions import * #可以查看requests.exceptions获取异常类型
    #
    # try:
    #     r=requests.get('http://www.baidu.com',timeout=0.00001)
    # # except ReadTimeout:
    # #     print('===:')
    # # except ConnectionError: #网络不通
    # #     print('-----')
    # # except Timeout:
    # #     print('aaaaa')
    #
    # except Exception as e:
    #     print(e)



    ## 上传文件
    # import requests
    # files={'file':open('a.jpg','rb')}
    # respone=requests.post('http://httpbin.org/post',files=files)
    # print(respone.status_code)


    ## 代理设置

    import requests
    # 免费代理(不稳定)
    # 收费代理(稳定)
    # requests picks a proxy by the lowercase URL scheme, so an 'HTTP' key is
    # never matched and the request would silently bypass the proxy. The target
    # below is an https:// URL, so an 'https' entry is needed as well.
    proxies = {
        'http': 'http://117.95.200.239:9999',
        'https': 'http://117.95.200.239:9999',
    }
    # A timeout keeps an unresponsive proxy from blocking the script forever.
    respone = requests.get('https://www.12306.cn',
                           proxies=proxies,
                           timeout=10)

    print(respone.status_code)

    # 写一个django,取出访问者的ip地址,使用requests,加代理模块访问

    # 高匿和透明
    # 高匿:服务端取不到真实的ip
    # 透明:服务端可以取到真实的ip地址 请求头中:X-Forwarded-For(Django中通过request.META['HTTP_X_FORWARDED_FOR']获取)

    # 代理池:搞一堆代理,放到列表中,每次发请求,随机出一个(开源代理池)

     

     

    2.4 响应对象的方法

    import requests
    # Fetch one page, then inspect the attributes of the response object.
    page_url = 'https://www.autohome.com.cn/shanghai/'
    respone = requests.get(page_url)

    # Attributes of the response (uncomment a line to try it):
    # print(respone.text)            # body as text
    # print(respone.content)         # body as raw bytes
    # print(respone.status_code)     # HTTP status code
    # print(respone.headers)         # response headers
    # print(type(respone.cookies))   # cookie object: RequestsCookieJar
    from requests.cookies import RequestsCookieJar
    # print(respone.cookies.get_dict())  # cookie object converted to a dict
    # print(respone.cookies.items())
    # print(respone.url)             # request address
    # print(respone.history)         # on a 30x redirect: the responses before the redirect

    site_encoding = respone.encoding  # encoding reported for the site
    print(site_encoding)

    # respone.encoding='gb2312'
    # print(respone.text)
    #
    # #关闭:response.close()
    # from contextlib import closing
    # with closing(requests.get('xxx',stream=True)) as response:
    #     for line in response.iter_content():
    #         pass

    ## 解析json


    # import json
    # json.loads(respone.text)
    # respone.json()

    3 爬取梨视频

    # pip3 install requests

    import requests
    import re
    # 模拟发送http请求
    # res对象是响应对象
    # res=requests.get('https://www.pearvideo.com/category_8')
    # https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start=0
    import os

    # Fetch one page of the category listing; the response is the response object.
    res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=8&start=0')
    # print(res.text)

    # Collect the href of every video link on the listing page.
    video_list = re.findall(r'<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
    print(video_list)

    # open() does not create missing directories, so make sure the target exists.
    os.makedirs('video', exist_ok=True)

    for video in video_list:
        # The id is the part after the first underscore of the href.
        video_id = video.split('_')[1]
        # e.g. https://www.pearvideo.com/videoStatus.jsp?contId=1716693
        video_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + video_id
        # The status endpoint is queried with the video's own page as Referer.
        header = {
            'Referer': 'https://www.pearvideo.com/' + video
        }
        res = requests.get(video_url, headers=header)
        real_mp4 = res.json()['videoInfo']['videos']['srcUrl']

        # The returned srcUrl has a numeric prefix in its file name and does not
        # play; replacing that prefix with 'cont-<video id>' gives a playable URL.
        real_mp42 = real_mp4.replace(real_mp4.split('/')[-1].split('-')[0], 'cont-%s' % video_id)
        print(real_mp42)
        name = real_mp42.split('/')[-1]

        # stream=True avoids loading the whole video into memory, and an explicit
        # chunk size avoids iter_content()'s default of one byte per iteration.
        with requests.get(real_mp42, stream=True) as res:
            with open('video/%s' % name, 'wb') as f:
                for chunk in res.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)


    # https://video.pearvideo.com/mp4/adshort/20210118/ cont-1716868   -15578857_adpkg-ad_hd.mp4 可以播放
    # https://video.pearvideo.com/mp4/adshort/20210118/ 1611024074140   -15578857_adpkg-ad_hd.mp4 不可以播放

     

    4 自动登录某网站


    import requests
    # res=requests.get('http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2F')
    # cookie=res.cookies
    # Form fields posted to the login endpoint.
    login_form = dict(
        username='616564099@qq.com',
        password='lqz123',
        captcha='123',
        remember=1,
        ref='http://www.aa7a.cn/',
        act='act_login',
    )
    # Headers sent with the login request.
    request_headers = dict([
        ('Referer', 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout'),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'),
    ])
    login_url = 'http://www.aa7a.cn/user.php'
    login_resp = requests.post(login_url, data=login_form, headers=request_headers)
    # print(login_resp.text)

    # Cookies returned by the login response; after a successful login they can
    # be sent with later requests (e.g. to simulate placing an order).
    session_cookies = login_resp.cookies

    # Request the home page with those cookies passed in by hand, then check
    # whether the account name shows up in the page.
    home_resp = requests.get('http://www.aa7a.cn/', cookies=session_cookies)
    logged_in = '616564099@qq.com' in home_resp.text
    print(logged_in)

     

  • 相关阅读:
    Lock和synchronized的区别和使用(转发)
    redis集群配置
    分布式之redis(转发)
    拉格朗日乘法与KKT条件
    骨骼动画原理
    常用非线性优化算法总结
    广义线性回归模型(三)
    线性模型、最优化方法(二)
    矩阵微分基础(一)
    OpenGL坐标系统
  • 原文地址:https://www.cnblogs.com/usherwang/p/14299507.html
Copyright © 2011-2022 走看看