  • Web Scraping

    wusir

    The requests module

    import requests
    
    """
    # 1. Methods
        requests.get
        requests.post 
        requests.put 
        requests.delete 
        ...
        requests.request(method='POST')
    """
    # 2. Parameters
    """
        2.1  url
        2.2  headers
        2.3  cookies
        2.4  params
        2.5  data: send the request body form-encoded
                
                requests.post(
                    ...,
                    data={'user':'alex','pwd':'123'}
                )
                
                # raw request produced (roughly):
                POST /index HTTP/1.1
                host: c1.com
                Content-Type: application/x-www-form-urlencoded
                
                user=alex&pwd=123
                
        2.6  json: send the request body as JSON
                requests.post(
                    ...,
                    json={'user':'alex','pwd':'123'}
                )
                
                # raw request produced (roughly):
                POST /index HTTP/1.1
                host: c1.com
                Content-Type: application/json
                
                {"user": "alex", "pwd": "123"}
        2.7  proxies: route the request through a proxy
            # Without authentication
                proxy_dict = {
                    "http": "http://61.172.249.96:80",
                    "https": "http://61.185.219.126:3128",
                }
                ret = requests.get("https://www.proxy360.cn/Proxy", proxies=proxy_dict)
                
            
            # Proxy with authentication
                from requests.auth import HTTPProxyAuth
                
                proxyDict = {
                    'http': '77.75.105.165',
                    'https': '77.75.106.165'
                }
                auth = HTTPProxyAuth('username', 'password')
                
                r = requests.get("http://www.google.com", data={'xxx': 'ffff'}, proxies=proxyDict, auth=auth)
                print(r.text)
        #---------------------------------------- The following are less commonly used (good to know) -------------------------------------------------
        2.8  files: upload files
            # Send a file
                file_dict = {
                    'f1': open('xxxx.log', 'rb')  # a local file
                }
                requests.request(
                    method='POST',
                    url='http://127.0.0.1:8000/test/',
                    files=file_dict
                )
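            # files also accepts a (filename, fileobj) tuple per field to control the
            # uploaded file name (a sketch against the same hypothetical endpoint):
                requests.request(
                    method='POST',
                    url='http://127.0.0.1:8000/test/',
                    files={'f1': ('renamed.log', open('xxxx.log', 'rb'))}
                )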
                
        2.9  auth: HTTP authentication
        
            Internally:
                The username and password are encoded and placed in a request header:
                
                    - "user:password"
                    - base64("user:password")                           # encode
                    - "Basic base64("user:password")"                   # build the string
                    - Request header:
                        Authorization: "Basic base64("user:password")"  # put the built string into the header
                
            from requests.auth import HTTPBasicAuth, HTTPDigestAuth
    
            ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))
            # HTTPBasicAuth's __call__ runs when the request is prepared and performs the steps above
            print(ret.text)
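            # A minimal sketch of what HTTPBasicAuth builds internally
            # (base64-encode "user:password" and send it in the Authorization header):
            import base64
            token = base64.b64encode(b'wupeiqi:sdfasdfasdf').decode()
            ret = requests.get(
                'https://api.github.com/user',
                headers={'Authorization': 'Basic ' + token},  # the same header HTTPBasicAuth produces
            )
            print(ret.text)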
            
        2.10 timeout: give up if the server is too slow
            # ret = requests.get('http://google.com/', timeout=1)
            # print(ret)
        
            # a tuple means (connect timeout, read timeout):
            # ret = requests.get('http://google.com/', timeout=(5, 1))
            # print(ret)
            
        2.11 allow_redirects: whether to follow redirects
            # With allow_redirects=False the redirect response itself (e.g. 301/302) is returned.
            ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
            print(ret.text)
            
        2.12 stream: download large responses in chunks
            from contextlib import closing
            with closing(requests.get('http://httpbin.org/get', stream=True)) as r1:
                # Process the response here, chunk by chunk.
                for i in r1.iter_content():
                    print(i)
    #--------------------------- The following you will rarely need; just know it exists ---------------------------------------------
        2.13 cert: client-side certificate
            - Baidu, Tencent => no certificate needed (the system handles it for you)
            - Custom certificate
                requests.get('http://127.0.0.1:8000/test/', cert="xxxx/xxx/xxx.pem")
                requests.get('http://127.0.0.1:8000/test/', cert=("xxxx/xxx/xxx.pem", "xxx.xxx.xx.key"))
        2.14 verify=False: skip server certificate verification
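            # A sketch: talk to a self-signed HTTPS endpoint without verifying its certificate
            #     requests.get('https://127.0.0.1:8000/test/', verify=False)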
    """
    
    

    Sessions in requests:

    - session (though carrying the cookie yourself is still recommended)
    Previously we carried the cookie ourselves: every request had to attach it explicitly. The
    requests module provides a Session object that stores all the cookies you receive, and your
    headers as well, i.e. it keeps both request-side and response-side state, so you no longer
    need to attach the cookie yourself; the session attaches it automatically on every request.
    Usage (a fuller sketch follows the stubs below):
    
        session = requests.Session()
        
        session.get()
        session.post()
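
    A fuller sketch of the same idea (the URLs and form fields below are placeholders,
    not a real site's API):

        import requests

        session = requests.Session()                    # cookies persist across calls
        session.headers['user-agent'] = 'Mozilla/5.0'   # headers set here persist too

        session.get('https://example.com/')             # first visit: the server sets cookies
        session.post(                                   # cookies are attached automatically
            'https://example.com/login',
            data={'user': 'alex', 'pwd': '123'},        # placeholder credentials
        )
        r = session.get('https://example.com/profile')  # still carrying the session cookies
        print(r.status_code)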

    Example:

    # ###################### Scrape Autohome #####################################
    
    '''
    import requests
    # requests: impersonate a browser and send HTTP requests
    
    from bs4 import BeautifulSoup
    # pip3 install BeautifulSoup4
    # BeautifulSoup parses an HTML-formatted string into an object; then use obj.find / obj.find_all
    
    response = requests.get("https://www.autohome.com.cn/news/")
    response.encoding = 'gbk'
    # The body is transferred as bytes; set the character encoding used to decode it into text
    
    soup = BeautifulSoup(response.text, 'html.parser')  # parse the HTML string into a soup object
    
    div = soup.find(name='div', attrs={'id': 'auto-channel-lazyload-article'})  # find returns the first match
    
    li_list = div.find_all(name='li')  # all li tags under the div
    
    for li in li_list:
    
        title = li.find(name='h3')
        if not title:
            continue
        p = li.find(name='p')
        a = li.find(name='a')
    
        print(title.text)           # text inside the h3 tag: the title
        print(a.attrs.get('href'))  # href attribute of the a tag: the link
        print(p.text)               # text inside the p tag: the summary
    
        img = li.find(name='img')
        src = img.get('src')
        src = "https:" + src
        print(src)
    
        # Send another request to download the image
        file_name = src.rsplit('/', maxsplit=1)[1]  # take the file name
        ret = requests.get(src)
        with open(file_name, 'wb') as f:
            f.write(ret.content)   # .content is the response body as bytes
    
    '''
    
    # ################################### Example 1: scrape data (sending request headers) ###################################
    """ import requests from bs4 import BeautifulSoup r1 = requests.get( url='https://dig.chouti.com/', headers={ 'user-agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36' } ) soup = BeautifulSoup(r1.text,'html.parser') # 标签对象 content_list = soup.find(name='div',id='content-list') # print(content_list) # [标签对象,标签对象] item_list = content_list.find_all(name='div',attrs={'class':'item'}) for item in item_list: a = item.find(name='a',attrs={'class':'show-content color-chag'}) print(a.text.strip()) # print(a.text) """

    # ################################### Example 2: upvote a post ###################################
    # Visiting the site returns a cookie; the login request must carry that cookie
    # (the cookie returned by the login response itself is a decoy).
    import requests
    
    # 1. Load the home page
    r1 = requests.get(
        url='https://dig.chouti.com/',
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
    )
    
    # 2. Submit the username and password
    r2 = requests.post(
        url='https://dig.chouti.com/login',
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        },
        data={
            'phone': '8613121758648',
            'password': 'woshiniba',
            'oneMonth': 1
        },
        cookies=r1.cookies.get_dict()
    )
    
    # 3. Upvote
    r3 = requests.post(
        url='https://dig.chouti.com/link/vote?linksId=20435396',
        headers={
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        },
        cookies=r1.cookies.get_dict()
    )
    print(r3.text)
    """"""
    # ################################### Example 3: automated GitHub login ###################################
    # 1. GET the login page
    """
    - Find the hidden input tag in the HTML to get the csrf token
    - Grab the cookie
    """
    
    # 2. POST the username and password
    """
    - Send:
        - csrf token
        - username
        - password
    - Carry the cookie
    """
    
    # 3. GET https://github.com/settings/emails
    """
    - Carry the cookie
    """
    
    import requests
    from bs4 import BeautifulSoup
    
    # ############## Approach 1 ##############
    #
    # # 1. Visit the login page and grab authenticity_token
    # i1 = requests.get('https://github.com/login')
    # soup1 = BeautifulSoup(i1.text, features='lxml')
    # tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
    # authenticity_token = tag.get('value')
    # c1 = i1.cookies.get_dict()
    # i1.close()
    #
    # # 2. Send the login form with authenticity_token, the username, and the password
    # form_data = {
    #     "authenticity_token": authenticity_token,
    #     "utf8": "",
    #     "commit": "Sign in",
    #     "login": "wupeiqi@live.com",
    #     'password': 'xxoo'
    # }
    #
    # i2 = requests.post('https://github.com/session', data=form_data, cookies=c1)
    # c2 = i2.cookies.get_dict()
    # c1.update(c2)
    # i3 = requests.get('https://github.com/settings/repositories', cookies=c1)
    #
    # soup3 = BeautifulSoup(i3.text, features='lxml')
    # list_group = soup3.find(name='div', class_='listgroup')
    #
    # from bs4.element import Tag
    #
    # for child in list_group.children:
    #     if isinstance(child, Tag):
    #         project_tag = child.find(name='a', class_='mr-1')
    #         size_tag = child.find(name='small')
    #         temp = "Project: %s (%s); project path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
    #         print(temp)
    
    
    # ############## Approach 2 ##############
    # session = requests.Session()
    # # 1. Visit the login page and grab authenticity_token
    # i1 = session.get('https://github.com/login')
    # soup1 = BeautifulSoup(i1.text, features='lxml')
    # tag = soup1.find(name='input', attrs={'name': 'authenticity_token'})
    # authenticity_token = tag.get('value')
    # c1 = i1.cookies.get_dict()
    # i1.close()
    #
    # # 2. Send the login form with authenticity_token, the username, and the password
    # form_data = {
    #     "authenticity_token": authenticity_token,
    #     "utf8": "",
    #     "commit": "Sign in",
    #     "login": "wupeiqi@live.com",
    #     'password': 'xxoo'
    # }
    #
    # i2 = session.post('https://github.com/session', data=form_data)
    # c2 = i2.cookies.get_dict()
    # c1.update(c2)
    # i3 = session.get('https://github.com/settings/repositories')
    #
    # soup3 = BeautifulSoup(i3.text, features='lxml')
    # list_group = soup3.find(name='div', class_='listgroup')
    #
    # from bs4.element import Tag
    #
    # for child in list_group.children:
    #     if isinstance(child, Tag):
    #         project_tag = child.find(name='a', class_='mr-1')
    #         size_tag = child.find(name='small')
    #         temp = "Project: %s (%s); project path: %s" % (project_tag.get('href'), size_tag.string, project_tag.string, )
    #         print(temp)

     Lagou example:

    ######################## Lagou ###############################
    
    '''
    - Password encryption
        - Find the JS and reimplement the encryption in Python
        - Or capture the ciphertext and map password <=> ciphertext
    
    - Referer header: the previous request's address; can be used for hotlink protection.
    
    '''
    
    import re
    import requests
    
    r1 = requests.get(
        url='https://passport.lagou.com/login/login.html',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
    )
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    # print(X_Anti_Forge_Token, X_Anti_Forge_Code)
    # print(r1.text)
    #
    r2 = requests.post(
        url='https://passport.lagou.com/login/login.json',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'X-Anit-Forge-Code':X_Anti_Forge_Code,
            'X-Anit-Forge-Token':X_Anti_Forge_Token,
            'Referer': 'https://passport.lagou.com/login/login.html',  # what was the previous request's address?
        },
        data={
            "isValidate": True,
            'username': '15269853268',
            'password': 'ab18d328d7126ea65915c50359c22c0d',
            'request_form_verifyCode': '',
            'submit': ''
        },
        cookies=r1.cookies.get_dict()
    )
    print(r2.text)

    Summary:

    Request headers:
        user-agent
        referer
        host
        cookie
        Special request headers: inspect the previous request/response to learn what to send, e.g.
            'X-Anit-Forge-Code': ...
            'X-Anit-Forge-Token': ...
    Request body:
        - raw data
        - raw data + token
        - ciphertext
            - find the algorithm, or
            - reuse a captured ciphertext
            
    Common patterns (the second one is sketched below):
        - POST to log in and get the cookie; carry that cookie on later requests
        - GET an unauthorized cookie, POST the login carrying it so the server authorizes it, then keep carrying it
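
    A minimal sketch of the second pattern (all URLs and fields are placeholders):

        import requests

        # GET first: the server hands out an unauthorized cookie
        r1 = requests.get('https://example.com/')
        cookies = r1.cookies.get_dict()

        # POST the login carrying that cookie so the server marks it authorized
        requests.post(
            'https://example.com/login',
            data={'user': 'alex', 'pwd': '123'},  # placeholder credentials
            cookies=cookies,
        )

        # Keep carrying the same (now authorized) cookie afterwards
        r2 = requests.get('https://example.com/profile', cookies=cookies)
        print(r2.status_code)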