zoukankan      html  css  js  c++  java
  • 爬虫基本操作、requests和BeautifulSoup

    1. 爬虫基本操作

    例如舆情系统:
      获取汽车之家新闻放到自己数据库里,创建自己的app,发布内容,注明来源,自己创业。

    URL指定内容获取到
        - 发送Http请求:http://www.autohome.com.cn/news/
        - 基于正则表达式获取内容 
    

    Python实现:

    import requests
    from bs4 import BeautifulSoup
    
    response = requests.get('http://www.autohome.com.cn/news/')
    response.text
    
    obj = BeautifulSoup(response.text,...)
    标签对象 = obj.find('a') # 找到匹配成功的第一个标签
    标签对象.find(...)
    
    [标签对象,标签对象,]= obj.find_all('a') # 找到匹配成功的所有标签
    

    示例一:爬取汽车之家新闻

    requests
    	
    	obj = requests.get("url")
    	obj.content
    	obj.encoding = "gbk"
    	obj.text
    	
    	
    	soup = BeautifulSoup(obj.text,'html.parser')
    	标签对象 = soup.find(name='xx')
    	[标签对象,标签对象,] = soup.find_all(...)
    	
    	
    	标签对象.text
    	标签对象.attrs
    	标签对象.get(...)
    	
    
    import requests
    from bs4 import BeautifulSoup

    # Demo: fetch the AutoHome news page and print the first <h3> headline.
    resp = requests.get('http://www.autohome.com.cn/news/')     # HTTP body arrives as bytes on the socket
    resp.encoding = 'gbk'   # the site is GBK-encoded; set this before reading resp.text
    # resp.content -> raw bytes; resp.text -> decoded str (garbled if the encoding is wrong)

    # html.parser is the parser bundled with the standard library; it turns
    # the raw <html lang='en'...></html> markup into a navigable object tree.
    doc = BeautifulSoup(resp.text, 'html.parser')
    article_box = doc.find(id='auto-channel-lazyload-article')
    # find(name='h3', class_='c1') or attrs={'class': 'c1'} would also filter by CSS class;
    # plain class='c1' is a SyntaxError because `class` is a Python keyword.
    first_headline = article_box.find(name='h3')    # first matching <h3> tag
    print(first_headline)
    练习一:获取一个新闻
    # Exercise: list every news entry (title, summary, link, image) on the page.
    response = requests.get('http://www.autohome.com.cn/news/')
    response.encoding = 'gbk'   # site is GBK; set before reading .text
    soup = BeautifulSoup(response.text, 'html.parser')
    # find_all('li') is shorthand for find_all(name='li')
    li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')
    for li in li_list:
        # Some <li> entries are placeholders without an <h3>; skip them.
        title = li.find('h3')
        if not title:
            continue
        # FIX: guard the remaining lookups the same way the <h3> is guarded —
        # calling .text / .get on a missing tag (None) would raise AttributeError.
        p_tag = li.find('p')
        a_tag = li.find('a')
        img_tag = li.find('img')
        if not (p_tag and a_tag and img_tag):
            continue
        summary = p_tag.text
        url = a_tag.get('href')     # equivalent to a_tag.attrs['href']
        img = img_tag.get('src')

        # # Download the image (disabled).
        # # NOTE(review): using the title as the file name is unsafe — titles
        # # may contain characters that are invalid in file names.
        # res = requests.get(img)
        # file_name = '%s.jpg' % (title,)
        # with open(file_name, 'wb') as f:
        #     f.write(res.content)

        print(title.text, summary,url,img)  # title text, summary, article url, image url
        print('=============')
    练习二:找到所有新闻,其中包括标题,简介,url,图片

    示例二:python代码登录github

    1. 登录页面发送请求GET,获取csrftoken
    2. 发送POST请求:
      携带用户名、密码、csrftoken发送POST请求
      产生cookie,拿到后下次就不需要登录了
    
    requests
    	
    	obj = requests.get("url")
    	obj.content
    	obj.encoding = "gbk"
    	obj.text
    	obj.cookies.get_dict()
    	
    	
    	requests.get("url",cookies={'k1':"v1"})
    	
    	
    	soup = BeautifulSoup(obj.text,'html.parser')
    	标签 = soup.find(name='xx')
    	[标签,] = soup.find_all(...)
    	
    	
    	标签.text
    	标签.attrs
    	标签.get(...)
    
    import requests
    from bs4 import BeautifulSoup

    # Step 1: GET the login page to obtain the anti-CSRF token.
    # GitHub's per-form credential is named "authenticity_token", not "csrf_token".
    r1 = requests.get('https://github.com/login')
    s1 = BeautifulSoup(r1.text, 'html.parser')
    token = s1.find(name='input', attrs={'name': 'authenticity_token'}).get('value')
    print(token)
    r1_token_dict = r1.cookies.get_dict()

    # Step 2: POST username/password/token to the session endpoint.
    # The form fields below were copied from the browser's Network panel:
    """
    utf8:?
    authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
    login:asdf
    password:asdf
    commit:Sign in
    """

    r2 = requests.post(
        'https://github.com/session',   # FIX: was http:// — never send credentials over plain HTTP
        data={
            'utf8': '?',
            'authenticity_token': token,
            'login': '317828332@qq.com',    # NOTE(review): hard-coded credentials; load from config/env instead
            'password': 'alex3714',
            'commit': 'Sign in'
        },
        cookies=r1_token_dict
    )
    # print(r2.text)
    r2_cookie_dict = r2.cookies.get_dict()
    print(r1_token_dict)        # some sites set cookies on the initial GET, some do not
    print(r2_cookie_dict)       # cookies produced by the login POST

    # Merge both cookie jars: the authenticated session needs cookies from both responses.
    cookie_dict = {}
    cookie_dict.update(r1_token_dict)
    cookie_dict.update(r2_cookie_dict)

    # Step 3: any page that requires login can now be fetched with the merged cookies.
    r3 = requests.get(
        url='https://github.com/settings/emails',
        cookies=cookie_dict
    )
    print(r3.text)
    代码实现

    示例三:对抽屉新闻点赞

    # 1.登录,拿到cookie
    # 2.找到标签url,看抽屉页面发送的点赞请求,首先看往哪个url发送请求。
    # 发送的是post请求,发送的url地址:http://dig.chouti.com/login。返回的不是让浏览器直接跳转页面,返回的是字典
    
    import requests
    from bs4 import BeautifulSoup

    # Step 1: hit the home page to pick up the anonymous session cookies.
    home_resp = requests.get('http://dig.chouti.com/')
    anon_cookies = home_resp.cookies.get_dict()

    # Step 2: log in, sending phone, password and the anonymous cookies.
    login_resp = requests.post(
        'http://dig.chouti.com/login',
        data={
            'phone':'8615131255089',
            'password':'woshiniba',
            'oneMonth':1    # stay logged in for a month
        },
        cookies=anon_cookies
    )
    login_cookies = login_resp.cookies.get_dict()
    print(login_resp.text)
    # e.g. {"result":{"code":"8887", "message":"手机号格式不对", "data":""}} when the phone number is invalid
    print(login_resp.cookies.get_dict())
    # e.g. {'gpsd': '...', 'route': '...'}

    # Merge the anonymous and post-login cookie jars into one session dict.
    session_cookies = {**anon_cookies, **login_cookies}

    # session_cookies = {'gpsd': anon_cookies['gpsd']} would also work, but merging everything is clearer.

    # Step 3: upvote — a POST request; linksId=13911006 is the article id.
    vote_resp = requests.post('http://dig.chouti.com/link/vote?linksId=13911006', cookies=session_cookies)
    print(vote_resp.text)
    View Code

     

    2. requests模块

    requests模块中提供的方法

    # requests.get()
    # requests.post()
    # requests.put()
    # requests.request('post')
    
    # requests.get(url, params=None, **kwargs)
    # requests.post(url, data=None, json=None, **kwargs)
    # requests.put(url, data=None, **kwargs)
    # requests.head(url, **kwargs)
    # requests.delete(url, **kwargs)
    # requests.patch(url, data=None, **kwargs)
    # requests.options(url, **kwargs)
    #
    # # 以上方法均是在此方法的基础上构建
    # requests.request(method, url, **kwargs)
    调用关系
    # url='xxx',
    # params={'k1':'v1','nid':888},     #GET传参
    # cookies={},
    # headers={},
    # data = {},        # data提供数据
    # json = {}         # json提供数据
    
    
    # requests.get(
    #     url='xxx',
    #     params={'k1':'v1','nid':888},
    #     cookies={},
    #     headers={}
    # )
    # http://www.baidu.com?k1=v1&nid=888
    
    requests.post(
        url='xxx',
        params={'k1':'v1','nid':888},
        cookies={},
        headers={},
        json={}
    )
    
    # 注意:向后台发送请求时,注意请求头
    
    # requests.post(url='',data={})   # 默认携带请求头application/x-www-form-urlencoded
    
    requests.post(url='',data={},headers={'content-type':'application/json'})   # 这样写的话django通过request.POST拿不到值,只能通过request.body中自己拿
    
    requests.post(url='',json={})       # 默认携带请求头headers={'content-type':'application/json'}
    常用参数
    # auth: requests' built-in HTTP authentication handlers
    def param_auth():
        """Demonstrate requests' HTTP Basic and Digest authentication helpers."""
        from requests.auth import HTTPBasicAuth, HTTPDigestAuth     # most home routers authenticate with HTTP Basic auth
        # The simple, commonly used schemes.
        # NOTE(review): the second assignment overwrites the first response —
        # only the Digest request's result is printed below; presumably intentional
        # for the demo, but confirm if both results were meant to be shown.
        ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))     # Basic auth
        ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))     # Digest auth
        # Real sites rarely rely on just these two schemes; anti-scraping defenses are usually stricter.
        print(ret.text)
    
        # ret = requests.get('http://192.168.1.1',)
        # auth=HTTPBasicAuth('admin', 'admin'))
        # ret.encoding = 'gbk'
        # print(ret.text)
    
        # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
        # print(ret)
    
    
    # timeout   超时时间限制
    
    
    # allow_redirects 允许重定向
    # 假设访问http://www.abc.com跳转到http://www.baidu.com
    response = requests.get('http://www.abc.com',allow_redirects=False)
    print(response.text)        # 不允许重定向,则返回的是http://www.abc.com的内容
    
    response = requests.get('http://www.abc.com',allow_redirects=True)
    print(response.text)       # 返回的是http://www.baidu.com的内容
    
    
    # proxies   代理,防止爬网页时,把ip封了,加代理。可以买代理,也可以自己搭代理服务器,自己生成
    
    # stream
    
    # verify    证书,例如12306的证书。知乎证书可带可不带
    requests.get('http://httpbin.org/get',stream=True,cert='xxxx.pem')  # stream=True需要携带证书,stream=False不需要携带证书
    其他参数

    3. BeautifulSoup

    BeautifulSoup:把html结构化成对象,通过对象的方式取html内部元素

    #html_doc = 
    #"""
    # <html><head><title>The Dormouse's story</title></head>
    # <body>
    # asdf
    #     <div class="title">
    #         <b>The Dormouse's story总共</b>
    #         <h1>f</h1>
    #     </div>
    # <div class="story">Once upon a time there were three little sisters; and their names were
    #     <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    #     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    # and they lived at the bottom of a well.</div>
    # ad<br/>sf
    # <p class="story">...</p>
    # </body>
    # </html>
    # """
    #from bs4 import BeautifulSoup
    #soup = BeautifulSoup(html_doc, features="lxml")		# 与BeautifulSoup(html_doc,'html.parser')不同的是使用的解析器不同,lxml性能更好,不过要安装lxml模块,推荐使用
    
    #tag = soup.find(class_='story')
    # print(tag)
    
    # print(tag.name)
    # #---> div
    # # tag.name = 'span' # 设置
    name属性
    # print(tag.attrs)
    # #---> {'class': ['story']}
    # tag.attrs['kkk'] = 'vvv'
    # print(tag.attrs)
    # #---> {'class': ['story'], 'kkk': 'vvv'}
    # del tag.attrs['kkk']
    # print(tag.attrs)
    # #---> {'class': ['story']}
    attrs属性
    # print(tag.children)
    # #---> <list_iterator object at 0x0000000002EA32B0>
    # print(list(tag.children))
    # #---> ['Once upon a time there were three little sisters; and their names were
        ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',
        ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and
        ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';
    and they lived at the bottom of a well.']
    # for item in tag.children:
    #     print(type(item),item)
    # # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
    #
    #     # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
    #     # <class 'bs4.element.NavigableString'> ,
    #     #
    #     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    #     # <class 'bs4.element.NavigableString'>  and
    #     #
    #     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    #     # <class 'bs4.element.NavigableString'> ;
    #     # and they lived at the bottom of a well.
    children属性
    # print(tag)
    # # ---> <div class="story">Once upon a time there were three little sisters; and their names were
    #     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    #     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    #     # and they lived at the bottom of a well.</div>
    # tag.clear()
    # print(tag)
    # ---> <div class="story"></div>
    clear属性,清空,但保留标签名
    # tag.decompose()
    # print(tag)
    # #---> <None></None>
    decompose,递归的删除所有的标签
    # taga = tag.find(name='a')
    # taga.extract()
    # print(tag)
    extract属性,递归的删除所有的标签,并获取删除的标签
    # print(tag.decode())
    # #---> <div class="story">Once upon a time there were three little sisters; and their names were
    #     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    #     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    #     # and they lived at the bottom of a well.</div>
    # print(type(tag.decode()))
    # # ---> <class 'str'>
    # print(tag.decode_contents(),type(tag.decode_contents()))
    # #---> Once upon a time there were three little sisters; and their names were
    # #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    # #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    # #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    # # and they lived at the bottom of a well. <class 'str'>
    decode 将标签对象转为字符串类型.但decode_contents(不含当前标签)
    # print(tag.decode())
    # #---> <div class="story">Once upon a time there were three little sisters; and their names were
    #     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    #     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    #     # and they lived at the bottom of a well.</div>
    # print(type(tag.decode()))
    # # ---> <class 'str'>
    # print(tag.decode_contents(),type(tag.decode_contents()))
    # #---> Once upon a time there were three little sisters; and their names were
    # #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    # #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    # #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    # # and they lived at the bottom of a well. <class 'str'>
    decode 将标签对象转为字符串类型.但decode_contents(不含当前标签)
    # print(type(tag.encode()))
    # # ---> <class 'bytes'>
    # print(tag.encode())
    # #---> b'<div class="story">Once upon a time there were three little sisters; and their names were
        <a class="sister0" id="link1">Els<span>f</span>ie</a>,
        <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
        <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</div>'
    # print(tag.encode_contents(),type(tag.encode_contents()))
    encode,转换为字节(含当前标签);encode_contents(不含当前标签)
    # tag = soup.find('a')
    # print(tag)
    # tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')    # recursive递归找;text文本内容,很少用
    # tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    # print(tag)
    find,获取匹配的第一个标签
    # tags = soup.find_all('a')
    # print(tags)
    
    # tags = soup.find_all('a',limit=1)     # limit=1只找一个
    # print(tags)
    
    # tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    # # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    # print(tags)
    find_all,获取匹配的所有标签
    # v = soup.find_all(name=['a','div'])       # name=['a','div'] 查找‘a’标签和'div'标签
    
    # print(v)
    
    # v = soup.find_all(class_=['sister0', 'sister'])   # class_=['sister0', 'sister']查找class='sister0'或者class='sister'
    # print(v)
    
    # v = soup.find_all(text=['Tillie'])
    # print(v, type(v[0]))
    
    
    # v = soup.find_all(id=['link1','link2'])
    # print(v)
    
    # v = soup.find_all(href=['link1','link2'])
    # print(v)
    列表
    #import re
    # rep = re.compile('p')
    # rep = re.compile('^p')
    # v = soup.find_all(name=rep)
    # print(v)
    
    # rep = re.compile('sister.*')
    # v = soup.find_all(class_=rep)
    # print(v)
    
    # rep = re.compile('http://www.oldboy.com/static/.*')
    # v = soup.find_all(href=rep)
    # print(v)
    正则
    # def func(tag):
    #     return tag.has_attr('class') and tag.has_attr('id')       # 返回结果为True,就把结果给v = soup.find_all()
    # v = soup.find_all(name=func)      # name=func把标签遍历一遍,每找到标签执行一次函数。
    # print(v)
    方法筛选,不常用
    # tag = soup.find('a')
    # v = tag.get('id')
    # print(v)
    get,获取标签属性
    # tag = soup.find('a')
    # v = tag.has_attr('id')
    # print(v)
    has_attr,检查标签是否具有该属性
    # tag = soup.find('a')
    # v = tag.get_text()
    # print(v)
    get_text,获取标签内部文本内容
    # tag = soup.find('body')
    # v = tag.index(tag.find('div'))
    # print(v)
    # tag = soup.find('body')
    # for i,v in enumerate(tag):
    #     print(i,v)
    index,检查标签在某标签中的索引位置
    is_empty_element,是否是空标签(是否可以是空)或者自闭合标签
    # soup.next             # 找下一个,不管是标签还是文本
    # soup.next_element     # 找下一个标签
    # soup.next_elements
    # soup.next_sibling     # 找兄弟姐妹
    # soup.next_siblings
    
    # tag.previous
    # tag.previous_element
    # tag.previous_elements
    # tag.previous_sibling
    # tag.previous_siblings
    
    # tag.parent
    # tag.parents
    当前的关联标签
    # tag.find_next(...)
    # tag.find_all_next(...)
    # tag.find_next_sibling(...)
    # tag.find_next_siblings(...)
    
    # tag.find_previous(...)
    # tag.find_all_previous(...)
    # tag.find_previous_sibling(...)
    # tag.find_previous_siblings(...)
    
    # tag.find_parent(...)
    # tag.find_parents(...)
    # 参数同find_all
    查找某标签的关联标签
    # soup.select("title")
    #
    # soup.select("p nth-of-type(3)")
    #
    # soup.select("body a")
    #
    # soup.select("html head title")
    #
    # tag = soup.select("span,a")
    #
    # soup.select("head > title")
    #
    # soup.select("p > a")
    #
    # soup.select("p > a:nth-of-type(2)")
    #
    # soup.select("p > #link1")
    #
    # soup.select("body > a")
    #
    # soup.select("#link1 ~ .sister")
    #
    # soup.select("#link1 + .sister")
    #
    # soup.select(".sister")
    #
    # soup.select("[class~=sister]")
    #
    # soup.select("#link1")
    #
    # soup.select("a#link2")
    #
    # soup.select('a[href]')
    #
    # soup.select('a[href="http://example.com/elsie"]')
    #
    # soup.select('a[href^="http://example.com/"]')
    #
    # soup.select('a[href$="tillie"]')
    #
    # soup.select('a[href*=".com/el"]')
    #
    # from bs4.element import Tag
    #
    #
    # def default_candidate_generator(tag):
    #     for child in tag.descendants:
    #         if not isinstance(child, Tag):
    #             continue
    #         if not child.has_attr('href'):
    #             continue
    #         yield child
    #
    #
    # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
    # print(type(tags), tags)
    #
    # from bs4.element import Tag
    #
    #
    # def default_candidate_generator(tag):
    #     for child in tag.descendants:
    #         if not isinstance(child, Tag):
    #             continue
    #         if not child.has_attr('href'):
    #             continue
    #         yield child
    #
    #
    # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
    # print(type(tags), tags)
    select, select_one, CSS选择器,select查找多个,select_one查找一个,但是参数类型不一样
    # tag = soup.find('span')
    # print(tag.string)          # 获取
    # tag.string = 'new content' # 设置
    # print(soup)
    
    # tag = soup.find('body')
    # print(tag.string)
    # tag.string = 'xxx'            # tag.text不能修改标签内容
    # print(soup)
    
    # tag = soup.find('body')
    # v = tag.stripped_strings  # 递归内部获取所有标签的文本
    # print(v)
    标签的内容
    # tag = soup.find('body')
    # tag.append(soup.find('a'))
    # print(soup)
    # 如果实在想追加当前标签已经存在的,方法如下
    # from bs4.element import Tag
    # obj = Tag(name='i',attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.append(obj)
    # print(soup)
    append在当前标签内部追加一个标签,当当前内部标签有追加的这个标签时,只是把当前标签内部位置被追加的标签移动到最后
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.insert(2, obj)
    # print(soup)
    insert在当前标签内部指定位置插入一个标签
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # # tag.insert_before(obj)
    # tag.insert_after(obj)
    # print(soup)
    insert_after, insert_before在当前标签后面或前面插入
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('div')
    # tag.replace_with(obj)
    # print(soup)
    replace_with 在当前标签替换为指定标签
    # tag = soup.find('div')
    # a = soup.find('a')
    # tag.setup(previous_sibling=a)
    # print(tag.previous_sibling)
    创建标签之间的关系,关系创建完后没什么用,不会改变标签间的位置
    # from bs4.element import Tag
    # obj1 = Tag(name='div', attrs={'id': 'it'})
    # obj1.string = '我是一个新来的'
    #
    # tag = soup.find('a')
    # v = tag.wrap(obj1)
    # print(soup)
    
    # tag = soup.find('a')
    # v = tag.wrap(soup.find('p'))
    # print(soup)
    wrap,将指定标签把当前标签包裹起来
    # tag = soup.find('a')
    # v = tag.unwrap()
    # print(soup)
    unwrap,去掉当前标签,将保留其包裹的标签
    # tag = soup.find('a')
    # v = tag.unwrap()
    # print(soup)
    unwrap,去掉当前标签,将保留其包裹的标签
  • 相关阅读:
    KKT条件原理
    拉格朗日乘子法
    Java volatile详解
    Java重排序
    Java Socket NIO入门
    Java Socket入门
    TCP三次握手,四次挥手
    Java NIO详解
    cobbler批量安装系统
    nginx详解反向代理,负载均衡,LNMP架构上线动态网站
  • 原文地址:https://www.cnblogs.com/xuyaping/p/7754844.html
Copyright © 2011-2022 走看看