  • Python web scraping with requests and BeautifulSoup

    1. What is a web crawler, essentially?

    It imitates the behavior of a browser in order to fetch information from web pages.
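    A minimal sketch of that idea (the URL and User-Agent string below are just placeholders): send the same kind of request a browser would, including a browser-like User-Agent header, then read the returned HTML.

    import requests

    # Pretend to be a browser by sending a browser-style User-Agent header (placeholder URL)
    ret = requests.get(
        'https://example.com/',
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    )
    print(ret.status_code)   # HTTP status code
    print(ret.text[:200])    # beginning of the returned HTML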

    2.requests

    1. GET requests

    Example without parameters
    import requests
    ret = requests.get('https://github.com/timeline.json')
    print(ret.text)


    Example with parameters
    import requests
    ret = requests.get("http://httpbin.org/get", params={'key1': 'value1', 'key2': 'value2'})

    print(ret.text)
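    The response object carries more than just the body text. A small sketch of inspecting the httpbin response above (assuming httpbin.org is reachable; it echoes the query parameters back as JSON):

    print(ret.status_code)   # e.g. 200
    print(ret.url)           # the final URL including the query string requests built from params
    print(ret.encoding)      # encoding used to decode ret.text
    print(ret.json())        # parsed JSON body; httpbin echoes key1/key2 under "args"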

     2. POST requests

    import requests
    import json

    url = 'https://api.github.com/some/endpoint'
    payload = {'k1': 'v1'}
    headers = {'content-type': 'application/json'}

    ret = requests.post(url, data=json.dumps(payload), headers=headers)

    print(ret.text)

    3. Other request methods

    requests.get(url, params=None, **kwargs)
    requests.post(url, data=None, json=None, **kwargs)
    requests.put(url, data=None, **kwargs)
    requests.head(url, **kwargs)
    requests.delete(url, **kwargs)
    requests.patch(url, data=None, **kwargs)
    requests.options(url, **kwargs)
      
    # All of the methods above are built on top of this one
    requests.request(method, url, **kwargs)
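    For example, these two calls are equivalent (a small sketch; the URL is only a placeholder):

    import requests

    # requests.get() simply forwards to requests.request() with method='get'
    r1 = requests.get('http://httpbin.org/get', params={'k1': 'v1'})
    r2 = requests.request('get', 'http://httpbin.org/get', params={'k1': 'v1'})
    print(r1.url == r2.url)  # True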

    4. More parameters and examples

    • method
      def param_method_url():
          ret=requests.request(method='get', url='http://127.0.0.1:8000/test/')
          ret=requests.request(method='post', url='http://127.0.0.1:8000/test/')
    • params
      import requests

      requests.get(url='http://127.0.0.1:8000/test/',
                   params={'k1': 'v1', 'k2': 'v2'})

      # This is equivalent to requests.get(url='xxxxx?k1=v1&k2=v2')
    • data
          # data can be a dict
          # data can be a string
          # data can be bytes
          # data can be a file object
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # data={'k1': 'v1', 'k2': '水电费'})
      
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # data="k1=v1; k2=v2; k3=v3; k3=v4"
          # )
      
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # data="k1=v1;k2=v2;k3=v3;k3=v4",
          # headers={'Content-Type': 'application/x-www-form-urlencoded'}
          # )
      
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # data=open('data_file.py', mode='r', encoding='utf-8'), # file content: k1=v1;k2=v2;k3=v3;k3=v4
          # headers={'Content-Type': 'application/x-www-form-urlencoded'}
          # )
    • json
      # If the request body should be sent as JSON, pass it via the json parameter
      requests.request(method='POST',
                           url='http://127.0.0.1:8000/test/',
                           json={'k1': 'v1', 'k2': '水电费'})
    • cookies
      ret1 = requests.get(
          url='https://dig.chouti.com/',
          headers={
              'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
          }   )
      ret1_cookies = ret1.cookies.get_dict()
      # ret1.cookies is the cookie jar returned for this URL
      # get_dict() converts it into a plain dict
    • headers

          # Send request headers to the server
          requests.request(method='POST',
                           url='http://127.0.0.1:8000/test/',
                           json={'k1': 'v1', 'k2': '水电费'},
                           headers={'Content-Type': 'application/x-www-form-urlencoded'}
                           )
          # Which headers are required depends on the server
    • files

       # Send a file
          # file_dict = {
          # 'f1': open('readme', 'rb')
          # }
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # files=file_dict)
      
          # Send a file with a custom filename
          # file_dict = {
          # 'f1': ('test.txt', open('readme', 'rb'))
          # }
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # files=file_dict)
      
          # Send a file with a custom filename and in-memory string content
          # file_dict = {
          # 'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf")
          # }
          # requests.request(method='POST',
          # url='http://127.0.0.1:8000/test/',
          # files=file_dict)
      
          # Send a file with a custom filename, content, content type and extra headers
          # file_dict = {
          #     'f1': ('test.txt', "hahsfaksfa9kasdjflaksdjf", 'application/text', {'k1': '0'})
          # }
          # requests.request(method='POST',
          #                  url='http://127.0.0.1:8000/test/',
          #                  files=file_dict)
      
          pass
    • timeout
       Sets a timeout: if the request takes longer than this, it is aborted. A single number is the total timeout; the tuple form (5, 1) sets the connect and read timeouts separately (see the sketch after this parameter list).
      # ret = requests.get('http://google.com/', timeout=1)
          # print(ret)
      
          # ret = requests.get('http://google.com/', timeout=(5, 1))
          # print(ret)
          pass
    • allow_redirects
      # Whether to follow redirects automatically; defaults to True (see the sketch after this parameter list)
      ret = requests.get('http://127.0.0.1:8000/test/', allow_redirects=False)
          print(ret.text)
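    A small self-contained sketch of the last two parameters (assuming httpbin.org is reachable; the Google URL is only used as a slow endpoint): catch the exception raised when the timeout expires, and read the redirect target manually when redirects are disabled.

      import requests

      # timeout: requests raises requests.exceptions.Timeout when the limit is exceeded
      try:
          ret = requests.get('http://google.com/', timeout=1)
          print(ret.status_code)
      except requests.exceptions.Timeout:
          print('request timed out')

      # allow_redirects=False: the 3xx response is returned as-is,
      # so the redirect target is available in the Location header
      ret = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
      print(ret.status_code)              # 302
      print(ret.headers.get('Location'))  # where the server wanted to send us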

    BeautifulSoup

    This module parses the HTML or XML you receive into a tree of objects, so you can quickly find the tags you want by working with those objects (a self-contained sketch follows the list of methods below).

    1. Basic usage example
      from bs4 import BeautifulSoup
       
      html_doc = """
      <html><head><title>The Dormouse's story</title></head>
      <body>
          ...
      </body>
      </html>
      """
       
      soup = BeautifulSoup(html_doc, features="lxml")
    2. name ---> the tag's name
      # tag = soup.find('a')
# name = tag.name # get the tag name
      # print(name)
# tag.name = 'span' # set the tag name
    3. attrs ---> the tag's attributes
      # tag = soup.find('a')
# attrs = tag.attrs    # get the attributes
      # print(attrs)
# tag.attrs = {'ik':123} # replace all attributes
      # tag.attrs['id'] = 'iiiii' # set a single attribute
    4. children ---> all direct child tags
      # body = soup.find('body')
      # v = body.children
    5. descendants ---> all descendants
      # body = soup.find('body')
      # v = body.descendants
    6. clear ---> remove all of a tag's children (the tag itself is kept)
      # tag = soup.find('body')
      # tag.clear()
      # print(soup)
    7. extract ---> remove the tag (and everything inside it) from the tree and return it
      # body = soup.find('body')
      # v = body.extract()
      # print(soup)
    8. find ---> get the first matching tag
      # tag = soup.find('a')
      # print(tag)
      # tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
      # tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
      # print(tag)
    9. find_all ---> get all matching tags
      # tags = soup.find_all('a')
      # print(tags)
       
      # tags = soup.find_all('a',limit=1)
      # print(tags)
       
      # tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
      # # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
      # print(tags)
       
       
# ####### lists of tag names / classes #######
      # v = soup.find_all(name=['a','div'])
      # print(v)
       
      # v = soup.find_all(class_=['sister0', 'sister'])
      # print(v)
    10. has_attr ---> check whether the tag has a given attribute
      # tag = soup.find('a')
      # v = tag.has_attr('id')
      # print(v)
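    The commented snippets above all assume a soup built from a page that actually contains <a> tags. A small self-contained sketch of the most common calls, using a made-up HTML fragment:

      from bs4 import BeautifulSoup

      html_doc = """
      <html><body>
        <p class="story">Once upon a time there were two sisters:
          <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
          <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
        </p>
      </body></html>
      """
      soup = BeautifulSoup(html_doc, 'html.parser')

      tag = soup.find('a')                       # first matching tag
      print(tag.name, tag.attrs)                 # 'a' and its attribute dict
      print(tag.get('href'))                     # read a single attribute
      print(tag.has_attr('id'))                  # True

      for a in soup.find_all('a', attrs={'class': 'sister'}):
          print(a.text, a.get('href'))           # all matching tags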

    Example: scraping Autohome (autohome.com.cn) news

    import requests
    from bs4 import BeautifulSoup
    # BeautifulSoup parses the HTML
    
    
    # Download the page
    ret = requests.get(url='https://www.autohome.com.cn/news/')
    # print(ret.apparent_encoding)  # encoding detected from the page content
    # print(ret.content)
    # ret.encoding = 'gbk'
    ret.encoding=ret.apparent_encoding
    # print(ret.text)
    
    # Parse the page and extract what we want
    soup = BeautifulSoup(ret.text, features='html.parser')  # lxml is often used instead (it must be installed separately)
    
    # find returns the first match
    div =soup.find(name='div',id='auto-channel-lazyload-article')
    
    # When you also need to match on class, pass attrs:
    # *****div = soup.find(name='div',attrs={'class':'dddd','id':'dfa'})*****
    
    li_list=div.find_all(name='li')  # find_all returns a list, so you cannot call .find on it
    
    # print(li_list)
    
    for row in li_list:
        h3=row.find(name='h3')
        if not h3:
            continue
    
        a=row.find(name='a')
        print(a.get('href'))
    
        p = row.find(name='p')
        print(p.text)
    
        li_img= row.find(name='img')
        src= li_img.get('src')
    
        file_name = src.rsplit('__',maxsplit=1)[1]
    
        ret_img = requests.get('https:'+src)
    
        with open(file_name,'wb') as f:
            f.write(ret_img.content)

    Example: Chouti (dig.chouti.com)

    import requests
    from bs4 import BeautifulSoup
    
    # The first request returns an unauthenticated cookie
    ret1 = requests.get(
        url='https://dig.chouti.com/',
        headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        })
    ret1_cookies = ret1.cookies.get_dict()
    
    # After logging in successfully, that cookie becomes authorized
    ret = requests.post(
        url='https://dig.chouti.com/login',
        data={
            'phone':'8613612201458',
            'password':'wo3384451',
            'oneMonth':'1'
        },
        headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        },
        cookies = ret1_cookies,
    )
    
    for num_page in range(2,10):
    
        ret_index= requests.get(url='https://dig.chouti.com/all/hot/recent/%s'%(num_page),
                                headers={
                                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
                                },
                                )
        soup = BeautifulSoup(ret_index.text,'html.parser')
    
        div = soup.find(name='div',id='content-list')
    
        item_list = div.find_all(attrs={'class':'part2'})
    
        for item in item_list:
            num = item.get('share-linkid')
    
            # Now send the upvote request carrying the authorized cookie
            ret3 = requests.post(
                url='https://dig.chouti.com/link/vote?linksId=%s'%(num),
                # data={'linksId':'%s'%(num)},
                headers={
                    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
                },
                cookies = ret1_cookies
            )
    
            print(ret3.text)

    GitHub example

    import requests
    import re
    from bs4 import BeautifulSoup
    
    class Github(object):
        def __init__(self,username=None,passward=None):
            self.username=username
            self.passward=passward
            self.all_cookies={}
    
            self.process()
    
        def process(self):
            if not (self.username and self.passward):
                raise Exception('Please provide a username and password')
            self.get_login_key()
    
        def get_login_key(self):
            # Fetch the authenticity_token
            login_result = requests.get(
                url='https://github.com/login',
                headers={
                    'Host': 'github.com',
                }
            )
            auth_key =BS4xpath.get_auth_key(login_result.text)
            self.all_cookies = login_result.cookies.get_dict()
            self.login(auth_key)
    
        def login(self,auth_key):
            # Log in and collect the authenticated cookies
            login_result = requests.post(
                url='https://github.com/session',
                headers={
                    'Upgrade-Insecure-Requests': '1',
                    'Host': 'github.com',
                },
                data={
                    'utf8': '',
                    'authenticity_token':auth_key,
                    'login': self.username,
                    'password': self.passward,
                    'commit': 'Sign in'
                },
                cookies=self.all_cookies
            )
            self.all_cookies.update(login_result.cookies.get_dict())
            if self.all_cookies['logged_in']=='no':
                raise Exception('Incorrect username or password')

        def get_msg(self):
            msg_obj = requests.get(
                url='https://github.com/settings/profile',
                headers={
                    'Host': 'github.com',
                    'Referer': 'https://github.com/',
                },
                cookies=self.all_cookies
            )
            msg=BS4xpath.get_msg_dict(msg_obj.text)
    
            return msg
    
    
    class BS4xpath(object):
    
        @classmethod
        def get_auth_key(self,text):
            soup = BeautifulSoup(text,'html.parser')
            auth_key=soup.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
            return auth_key
    
        @classmethod
        def get_msg_dict(self,text):
            response = {}
            ret2_data = BeautifulSoup(text,'html.parser')
            div = ret2_data.find(name='div', attrs={'class': "column two-thirds"})
            dl_list = div.find_all(name='dl', attrs={'class': "form-group"})
            for row in dl_list:
                rowname = row.find('label').text
                dd_input = row.find('input')
                if dd_input:
                    response[rowname] = dd_input.get('value')
            return response
    
    
    
    obj = Github(username='a3384451',passward='wo3384451')
    
    ret = obj.get_msg()
    print(ret)

    Example: Lagou (lagou.com)

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    import re
    import requests
    
    all_cookie = {}
    
    # ############### 1. Load the login page ###############
    r1 = requests.get(
        url='https://passport.lagou.com/login/login.html',
        headers={
            'Host': 'passport.lagou.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
    )
    
    all_cookie.update(r1.cookies.get_dict())
    
    X_Anti_Forge_Token = re.findall(r"window.X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall(r"window.X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    
    # ############### 2. Log in with username and password ###############
    r2 = requests.post(
        url='https://passport.lagou.com/login/login.json',
        headers={
            'Host': 'passport.lagou.com',
            'Referer': 'https://passport.lagou.com/login/login.html',
            'X-Anit-Forge-Code': X_Anti_Forge_Code,
            'X-Anit-Forge-Token': X_Anti_Forge_Token,
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        },
        data={
            'isValidate': True,
            'username': '15131255089',
            'password': 'ab18d270d7126ea65915cc22c0d',
            'request_form_verifyCode': '',
            'submit': '',
    
        },
        cookies=r1.cookies.get_dict()
    )
    
    all_cookie.update(r2.cookies.get_dict())
    
    # ############### 3. User authorization ###############
    r3 = requests.get(
        url='https://passport.lagou.com/grantServiceTicket/grant.html',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    
        },
        allow_redirects=False,
        cookies=all_cookie
    
    )
    
    all_cookie.update(r3.cookies.get_dict())
    
    # ############### 4. User authentication (follow the redirect chain) ###############
    r4 = requests.get(
        url=r3.headers['Location'],
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    
        },
        allow_redirects=False,
        cookies=all_cookie
    )
    
    all_cookie.update(r4.cookies.get_dict())
    
    r5 = requests.get(
        url=r4.headers['Location'],
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    
        },
        allow_redirects=False,
        cookies=all_cookie
    )
    all_cookie.update(r5.cookies.get_dict())
    r6 = requests.get(
        url=r5.headers['Location'],
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    
        },
        allow_redirects=False,
        cookies=all_cookie
    )
    
    all_cookie.update(r6.cookies.get_dict())
    r7 = requests.get(
        url=r6.headers['Location'],
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    
        },
        allow_redirects=False,
        cookies=all_cookie
    )
    
    all_cookie.update(r7.cookies.get_dict())
    
    # ############### 5. View the personal profile page ###############
    r5 = requests.get(
        url='https://www.lagou.com/resume/myresume.html',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    
        },
        cookies=all_cookie
    )
    print('武沛齐' in r5.text)
    
    # ############### 6. View account info ###############
    r6 = requests.get(
        url='https://gate.lagou.com/v1/neirong/account/users/0/',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'X-L-REQ-HEADER': "{deviceType:1}",
            'Origin': 'https://account.lagou.com',
            'Host': 'gate.lagou.com',
        },
        cookies=all_cookie
    
    )
    r6_json = r6.json()
    all_cookie.update(r6.cookies.get_dict())
    
    # ############### 7. Update personal info ###############
    r7 = requests.put(
        url='https://gate.lagou.com/v1/neirong/account/users/0/',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
            'Origin': 'https://account.lagou.com',
            'Host': 'gate.lagou.com',
            'X-Anit-Forge-Code': r6_json['submitCode'],
            'X-Anit-Forge-Token': r6_json['submitToken'],
            'X-L-REQ-HEADER': "{deviceType:1}",
        },
        cookies=all_cookie,
        json={"userName": "wupeiqi888", "sex": "MALE", "portrait": "images/myresume/default_headpic.png",
              "positionName": '...', "introduce": '....'}
    )
    print(r7.text)

    Preventing XSS attacks

    from bs4 import BeautifulSoup
    class XSSFilter(object):
        __instance = None
        def __init__(self):
            # XSS whitelist: allowed tags and the attributes each may keep
            self.valid_tags = {
                "font": ['color', 'size', 'face', 'style'],
                'b': [],
                'div': [],
                "span": [],
                "table": [
                    'border', 'cellspacing', 'cellpadding'
                ],
                'th': [
                    'colspan', 'rowspan'
                ],
                'td': [
                    'colspan', 'rowspan'
                ],
                "a": ['href', 'target', 'name'],
                "img": ['src', 'alt', 'title'],
                'p': ['align'],
                "pre": ['class'],
                "hr": ['class'],
                'strong': []
            }
        def __new__(cls, *args, **kwargs):
            if not cls.__instance:
                obj = object.__new__(cls)
                cls.__instance = obj
            return cls.__instance
        def process(self, content):
            soup = BeautifulSoup(content, 'html.parser')
            # iterate over every tag in the document
            for tag in soup.find_all():
                # check whether the tag name is in the whitelist
                if tag.name not in self.valid_tags:
                    tag.hidden = True
                    if tag.name not in ['html', 'body']:
                        tag.hidden = True
                        tag.clear()
                    continue
                # whitelist of attributes allowed on the current tag
                attr_rules = self.valid_tags[tag.name]
                keys = list(tag.attrs.keys())
                for key in keys:
                    if key not in attr_rules:
                        del tag[key]
            return soup.decode()  # the filtered content
    
    content="""
    <p class='c1' id='i1'>
       asdfaa<span style="font-family:NSimSun;" class='c1'>sdf<a>a</a>sdf</span>sdf
    </p>
    <p>
       <strong class='c2' id='i2'>asdf</strong>
       <script>alert(123)</script>
    </p>
    <h2>
       asdf
    </h2>
    """
    
    content = XSSFilter().process(content)
    print('content',content)

    Summary:

    1. If the target site has anti-scraping measures, make your requests mimic what a browser would send to the server.
    2. If the request needs to carry extra information:
      1. Look for it in the server's earlier responses; if it is there, parse it into a dict (or similar) and keep it for the session.
      2. Values that look like 159900098 are usually timestamps, but you have to check the number of digits yourself (see the sketch after this list).
      3. If the key is not in the response body, look for it in the page's HTML or JavaScript.
      4. The next request may have to carry a key (or other value) that the server sent in the previous response.
    3. Status codes:
      1. Status codes starting with 3 are redirects; cookie authentication may happen during the redirect.
      2. Pay attention to the Set-Cookie header in the response (see the sketch after this list).
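    A small sketch of the timestamp and Set-Cookie points above (assuming httpbin.org as a stand-in server):

    import time
    import requests

    # Timestamps: 10 digits usually means seconds, 13 digits usually means milliseconds
    ts_seconds = int(time.time())
    ts_millis = int(time.time() * 1000)
    print(ts_seconds, ts_millis)

    # Set-Cookie: requests collects it into response.cookies
    ret = requests.get('http://httpbin.org/cookies/set?k1=v1', allow_redirects=False)
    print(ret.headers.get('Set-Cookie'))   # raw response header
    print(ret.cookies.get_dict())          # as a dict, ready to send on the next request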

      

    Reference: http://www.cnblogs.com/wupeiqi/articles/6283017.html

    Official documentation (requests quickstart): http://cn.python-requests.org/zh_CN/latest/user/quickstart.html#id4

  • Original article: https://www.cnblogs.com/chenxuming/p/9269347.html