  • Web scraping basics, requests and BeautifulSoup

    1. Web scraping basics

    Example: a public-opinion monitoring system.
      Fetch Autohome news into your own database, build your own app, publish the content with the source credited, and turn it into a business of your own.

    Fetching the content at a given URL:
        - Send an HTTP request: http://www.autohome.com.cn/news/
        - Extract the content with regular expressions (a minimal sketch follows below)
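    A minimal sketch of the regex approach, assuming we only care about the news titles (the pattern is illustrative, not the real Autohome markup):

    import re
    import requests

    response = requests.get('http://www.autohome.com.cn/news/')
    response.encoding = 'gbk'
    # illustrative pattern: grab the text inside every <h3> tag
    titles = re.findall(r'<h3[^>]*>(.*?)</h3>', response.text, re.S)
    for title in titles:
        print(title)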
    

    Python implementation:

    import requests
    from bs4 import BeautifulSoup
    
    response = requests.get('http://www.autohome.com.cn/news/')
    response.text
    
    obj = BeautifulSoup(response.text,...)
    tag_object = obj.find('a') # returns the first matching tag
    tag_object.find(...)
    
    [tag_object, tag_object, ] = obj.find_all('a') # returns a list of all matching tags
    

    Example 1: scraping Autohome news

    requests
    	
    	obj = requests.get("url")
    	obj.content
    	obj.encoding = "gbk"
    	obj.text
    	
    	
    	soup = BeautifulSoup(obj.text,'html.parser')
    	tag_object = soup.find(name='xx')
    	[tag_object, tag_object, ] = soup.find_all(...)
    	
    	
    	tag_object.text
    	tag_object.attrs
    	tag_object.get(...)
    	
    
    import requests
    from bs4 import BeautifulSoup
    
    response = requests.get('http://www.autohome.com.cn/news/')     # the socket sends and receives bytes
    # # print(response.text)    # str; garbled if the encoding is not set correctly
    # print(response.content)     # response.content is the raw bytes
    response.encoding = 'gbk'
    # print(response.text)        # response.text is the decoded text
    
    # Python ships a built-in parser, html.parser, which turns the <html lang='en'...></html> page into an object
    soup = BeautifulSoup(response.text,'html.parser')
    tag = soup.find(id='auto-channel-lazyload-article')
    # h3 = tag.find(name='h3',class_='c1')     # name is the tag name. class='c1' raises an error (class is a keyword); write class_='c1' or attrs={'class':'c1'}
    # h3 = tag.find(name='h3',attrs={'class':'c1'})
    h3 = tag.find(name='h3')
    print(h3)
    Exercise 1: fetch a single news item
    response = requests.get('http://www.autohome.com.cn/news/')
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text,'html.parser')
    li_list = soup.find(id='auto-channel-lazyload-article').find_all('li')    # find_all('li') is shorthand for find_all(name='li')
    for li in li_list:
        # print(li.find('h3'))        # li.find('h3') is sometimes None
        title = li.find('h3')
        if not title:
            continue
        # print(title,type(title))    # <h3>将于第四季度上市 云度π1正式下线</h3> <class 'bs4.element.Tag'>
        summary = li.find('p').text
        # url = li.find('a').attrs['href']    # li.find('a').attrs returns all of the tag's attributes as a dict; get() also works for the url
        url = li.find('a').get('href')
        img = li.find('img').get('src')
    
        # # download the image
        # res= requests.get(img)
        # file_name = '%s.jpg'%(title,)       # using the title as the image filename is not a valid name; needs fixing
        # with open(file_name,'wb') as f:
        #     f.write(res.content)
    
        print(title.text, summary,url,img)  # title: title.text, summary: summary
        print('=============')
    Exercise 2: find all news items, with title, summary, url and image

    Example 2: logging in to GitHub with Python

    1. Send a GET request to the login page and grab the CSRF token
    2. Send a POST request:
      carrying the username, password and the token
      this produces a cookie; once you have it you no longer need to log in
    
    requests
    	
    	obj = requests.get("url")
    	obj.content
    	obj.encoding = "gbk"
    	obj.text
    	obj.cookies.get_dict()
    	
    	
    	requests.get("url",cookies={'k1':"v1"})
    	
    	
    	soup = BeautifulSoup(obj.text,'html.parser')
    	tag = soup.find(name='xx')
    	[tag, ] = soup.find_all(...)
    	
    	
    	tag.text
    	tag.attrs
    	tag.get(...)
    
    import requests
    from bs4 import BeautifulSoup
    
    # get the token
    r1 = requests.get('https://github.com/login')
    s1 = BeautifulSoup(r1.text,'html.parser')
    token = s1.find(name='input',attrs={'name':'authenticity_token'}).get('value')  # GitHub's login page does not call its credential csrf_token; the field is authenticity_token
    print(token)    # 4WLM4c+ilLUmmhsM8TEFiYXMX5evoTQaIxmhTc5FmUYetTseKP6Upx5jJkGOzjm3kCAg9sMv3ShMnz0UGzuGvA==
    r1_token_dict = r1.cookies.get_dict()
    
    # POST the username, password and token to the server
    # to see exactly what to send, inspect the request in the browser's Network panel (Headers)
    """
    utf8:?
    authenticity_token:ollV+avLm6Fh3ZevegPO7gOH7xUzEBL0NWdA1aOQ1IO3YQspjOHbfnaXJOtVLQ95BtW9GZlaCIYd5M6v7FGUKg==
    login:asdf
    password:asdf
    commit:Sign in
    """
    
    r2 = requests.post(
        'http://github.com/session',    # the POST url is taken from the browser's Network panel (Headers)
        data={
            'utf8':'?',
            'authenticity_token':token,
            # 'login':'username',
            'login':'317828332@qq.com',
            'password':'alex3714',
            # 'password':'password',
            'commit':'Sign in'
        },
        cookies = r1_token_dict
    )
    # print(r2.text)
    r2_cookie_dict = r2.cookies.get_dict()
    print(r1_token_dict)        # some pages set cookies on the GET request, some don't
    #---> {'logged_in': 'no', '_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjM5MjE5MSwiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--a5df8578d625ae99c39b34c4163f684a1d8ad568'}
    print(r2_cookie_dict)          # cookies set by the POST request
    #---> {'_gh_sess': 'eyJzZXNzaW9uX2lkIjoiZmMwOTdlNGNlY2U2MmZlNGU4MzBkZmQ2NmYwMjQxNDQiLCJsYXN0X3JlYWRfZnJvbV9yZXBsaWNhcyI6MTUwNDAwNjQwNzQwNywiX2NzcmZfdG9rZW4iOiJLYXJ5MXhpNnQ5SWdJQ3FKeDluamtjYnZ2NXNsWXQyQjBhSy9aQ3I2U1FBPSIsImZsYXNoIjp7ImRpc2NhcmQiOltdLCJmbGFzaGVzIjp7ImFuYWx5dGljc19sb2NhdGlvbl9xdWVyeV9zdHJpcCI6InRydWUifX19--db506f001c00ee91aefb55fad7c6cf9965ce3132'}
    
    # merge the two cookie dicts
    cookie_dict = {}
    cookie_dict.update(r1_token_dict)
    cookie_dict.update(r2_cookie_dict)
    
    # on subsequent requests
    r3 = requests.get(
        # url='xxxxxx',           # any GitHub page that requires login
        url='https://github.com/settings/emails',
        cookies=cookie_dict
    )
    print(r3.text)
    Full code

    Example 3: upvoting an article on Chouti

    # 1. Log in and get the cookie
    # 2. Find the target url: watch which url the Chouti page sends the upvote request to.
    # The login is a POST request to http://dig.chouti.com/login; it does not tell the browser to redirect, it returns a dict (JSON)
    
    import requests
    from bs4 import BeautifulSoup
    # 1. get the initial cookie
    r0 = requests.get('http://dig.chouti.com/')
    r0_cookie_dict = r0.cookies.get_dict()
    
    # 2. send username, password and the cookie
    r1 = requests.post(
        'http://dig.chouti.com/login',
        data={
            'phone':'8615131255089',
            'password':'woshiniba',
            'oneMonth':1    # stay logged in for a month
        },
        cookies=r0_cookie_dict
    )
    r1_cookie_dict = r1.cookies.get_dict()
    print(r1.text)
    #---> {"result":{"code":"8887", "message":"手机号格式不对", "data":""}}    #这是手机不对的情况下打印的内容
    print(r1.cookies.get_dict())
    #---> {'gpsd': 'd3c9d0b3dfff883f4e86f0094cbfd9bc', 'route': '967b7c98a00b517a995a5a62d3abc65e'}
    
    cookie_dict = {}
    cookie_dict.update(r0_cookie_dict)
    cookie_dict.update(r1_cookie_dict)
    
    # cookie_dict={'gpsd':r0_cookie_dict['gpsd']}      # equivalent to the cookie_dict above, but not recommended
    
    # upvote
    r2 = requests.post('http://dig.chouti.com/link/vote?linksId=13911006',cookies=cookie_dict)    # the upvote is a POST request; linksId=13911006 is the article id
    print(r2.text)

     

    2. The requests module

    Methods provided by the requests module

    # requests.get()
    # requests.post()
    # requests.put()
    # requests.request('post')
    
    # requests.get(url, params=None, **kwargs)
    # requests.post(url, data=None, json=None, **kwargs)
    # requests.put(url, data=None, **kwargs)
    # requests.head(url, **kwargs)
    # requests.delete(url, **kwargs)
    # requests.patch(url, data=None, **kwargs)
    # requests.options(url, **kwargs)
    #
    # # all of the above are built on top of this one method
    # requests.request(method, url, **kwargs)
    Call hierarchy
    # url='xxx',
    # params={'k1':'v1','nid':888},     # GET query-string parameters
    # cookies={},
    # headers={},
    # data = {},        # data: form-encoded request body
    # json = {}         # json: JSON request body
    
    
    # requests.get(
    #     url='xxx',
    #     params={'k1':'v1','nid':888},
    #     cookies={},
    #     headers={}
    # )
    # http://www.baidu.com?k1=v1&nid=888
    
    requests.post(
        url='xxx',
        params={'k1':'v1','nid':888},
        cookies={},
        headers={},
        json={}
    )
    
    # Note: when posting to the backend, pay attention to the request headers
    
    # requests.post(url='',data={})   # default request header: content-type: application/x-www-form-urlencoded
    
    requests.post(url='',data={},headers={'content-type':'application/json'})   # with this, Django cannot read the values via request.POST; you have to parse them out of request.body yourself (see the sketch below)
    
    requests.post(url='',json={})       # default request header: content-type: application/json
    Common parameters
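    A minimal Django view sketch of that difference (the view name demo_view is made up): form-encoded bodies (data={}) show up in request.POST, while a JSON body (json={}) has to be parsed out of request.body yourself.

    import json
    from django.http import JsonResponse

    def demo_view(request):                        # hypothetical view, for illustration only
        form_value = request.POST.get('k1')        # filled for application/x-www-form-urlencoded bodies
        try:
            json_value = json.loads(request.body).get('k1')   # needed for application/json bodies
        except ValueError:
            json_value = None
        return JsonResponse({'form': form_value, 'json': json_value})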
    # auth
    def param_auth():
        from requests.auth import HTTPBasicAuth, HTTPDigestAuth     # most routers authenticate with HTTPBasicAuth
        # simple, commonly used basic authentication schemes
        ret = requests.get('https://api.github.com/user', auth=HTTPBasicAuth('wupeiqi', 'sdfasdfasdf'))     # HTTPBasicAuth scheme
        ret = requests.get('https://api.github.com/user', auth=HTTPDigestAuth('wupeiqi', 'sdfasdfasdf'))     # HTTPDigestAuth scheme
        # real sites are rarely this simple; anti-scraping defences don't just check a username and password with these two schemes
        print(ret.text)
    
        # ret = requests.get('http://192.168.1.1',)
        # auth=HTTPBasicAuth('admin', 'admin'))
        # ret.encoding = 'gbk'
        # print(ret.text)
    
        # ret = requests.get('http://httpbin.org/digest-auth/auth/user/pass', auth=HTTPDigestAuth('user', 'pass'))
        # print(ret)
    
    
    # timeout   time limit for the request
    
    
    # allow_redirects   whether to follow redirects
    # suppose http://www.abc.com redirects to http://www.baidu.com
    response = requests.get('http://www.abc.com',allow_redirects=False)
    print(response.text)        # redirects not followed: you get the response of http://www.abc.com itself
    
    response = requests.get('http://www.abc.com',allow_redirects=True)
    print(response.text)       # you get the content of http://www.baidu.com
    
    
    # proxies   use a proxy so your IP doesn't get banned while scraping; you can buy proxies or run your own proxy server
    
    # stream    download the response body lazily, in chunks
    
    # verify / cert    certificates, e.g. 12306's certificate; for a site like Zhihu the certificate is optional
    requests.get('https://httpbin.org/get', cert='xxxx.pem', verify=True)  # cert supplies a client certificate, verify controls server-certificate checking (see the sketch after this block)
    Other parameters
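    A hedged sketch of how these parameters are typically passed (the proxy address and certificate path are placeholders, not real values):

    import requests

    # timeout: give up if the server hasn't answered within 10 seconds
    requests.get('http://httpbin.org/get', timeout=10)

    # proxies: route the request through a proxy
    requests.get(
        'http://httpbin.org/get',
        proxies={'http': 'http://10.0.0.1:8080', 'https': 'http://10.0.0.1:8080'},   # placeholder proxy
    )

    # stream: don't read the whole body at once; iterate over it in chunks
    r = requests.get('http://httpbin.org/bytes/102400', stream=True)
    for chunk in r.iter_content(chunk_size=1024):
        pass

    # verify / cert: server-certificate checking and an optional client certificate
    requests.get('https://httpbin.org/get', verify=True)        # verify the server certificate (the default)
    # requests.get('https://example.com', cert='client.pem')    # placeholder client certificate path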

    3. BeautifulSoup

    BeautifulSoup turns an HTML document into a structured object, so you can pull elements out of the HTML by walking objects (a minimal runnable sketch follows below, before the detailed notes).
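    A minimal runnable sketch of that idea (the HTML snippet is made up):

    from bs4 import BeautifulSoup

    html = '<div class="story"><a id="link1" href="http://example.com">Elsie</a></div>'
    soup = BeautifulSoup(html, 'html.parser')

    tag = soup.find('a')          # first matching tag object
    print(tag.name)               # ---> a
    print(tag.attrs)              # ---> {'id': 'link1', 'href': 'http://example.com'}
    print(tag.get('href'))        # ---> http://example.com
    print(tag.text)               # ---> Elsie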

    # html_doc = """
    # <html><head><title>The Dormouse's story</title></head>
    # <body>
    # asdf
    #     <div class="title">
    #         <b>The Dormouse's story总共</b>
    #         <h1>f</h1>
    #     </div>
    # <div class="story">Once upon a time there were three little sisters; and their names were
    #     <a  class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    #     <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    # and they lived at the bottom of a well.</div>
    # ad<br/>sf
    # <p class="story">...</p>
    # </body>
    # </html>
    # """
    #from bs4 import BeautifulSoup
    #soup = BeautifulSoup(html_doc, features="lxml")		# same as BeautifulSoup(html_doc,'html.parser') except for the parser; lxml is faster but has to be installed separately; recommended
    
    #tag = soup.find(class_='story')
    # print(tag)
    
    # print(tag.name)
    # #---> div
    # # tag.name = 'span' # set the tag name
    The name attribute
    # print(tag.attrs)
    # #---> {'class': ['story']}
    # tag.attrs['kkk'] = 'vvv'
    # print(tag.attrs)
    # #---> {'class': ['story'], 'kkk': 'vvv'}
    # del tag.attrs['kkk']
    # print(tag.attrs)
    # #---> {'class': ['story']}
    The attrs attribute
    # print(tag.children)
    # #---> <list_iterator object at 0x0000000002EA32B0>
    # print(list(tag.children))
    # #---> ['Once upon a time there were three little sisters; and their names were
    # #     ', <a class="sister0" id="link1">Els<span>f</span>ie</a>, ',
    # #     ', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and
    # #     ', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';
    # # and they lived at the bottom of a well.']
    # for item in tag.children:
    #     print(type(item),item)
    # # ---> <class 'bs4.element.NavigableString'> Once upon a time there were three little sisters; and their names were
    #
    #     # <class 'bs4.element.Tag'> <a class="sister0" id="link1">Els<span>f</span>ie</a>
    #     # <class 'bs4.element.NavigableString'> ,
    #     #
    #     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    #     # <class 'bs4.element.NavigableString'>  and
    #     #
    #     # <class 'bs4.element.Tag'> <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    #     # <class 'bs4.element.NavigableString'> ;
    #     # and they lived at the bottom of a well.
    The children attribute
    # print(tag)
    # # ---> <div class="story">Once upon a time there were three little sisters; and their names were
    #     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    #     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    #     # and they lived at the bottom of a well.</div>
    # tag.clear()
    # print(tag)
    # ---> <div class="story"></div>
    clear: empties the tag's contents but keeps the tag itself
    # tag.decompose()
    # print(tag)
    # #---> <None></None>
    decompose: recursively removes the tag and everything inside it
    # taga = tag.find(name='a')
    # taga.extract()
    # print(tag)
    extract: recursively removes the tag and returns the removed tag
    # print(tag.decode())
    # #---> <div class="story">Once upon a time there were three little sisters; and their names were
    #     #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    #     #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    #     #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    #     # and they lived at the bottom of a well.</div>
    # print(type(tag.decode()))
    # # ---> <class 'str'>
    # print(tag.decode_contents(),type(tag.decode_contents()))
    # #---> Once upon a time there were three little sisters; and their names were
    # #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    # #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    # #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    # # and they lived at the bottom of a well. <class 'str'>
    decode: converts the tag object to a string; decode_contents does the same but without the current tag itself
    # print(type(tag.encode()))
    # # ---> <class 'bytes'>
    # print(tag.encode())
    # #---> b'<div class="story">Once upon a time there were three little sisters; and their names were
    # #     <a class="sister0" id="link1">Els<span>f</span>ie</a>,
    # #     <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    # #     <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
    # # and they lived at the bottom of a well.</div>'
    # print(tag.encode_contents(),type(tag.encode_contents()))
    encode: converts the tag to bytes (including the current tag); encode_contents: without the current tag
    # tag = soup.find('a')
    # print(tag)
    # tag = soup.find(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')    # recursive: search recursively; text: match the text content (rarely used)
    # tag = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    # print(tag)
    find: returns the first matching tag
    # tags = soup.find_all('a')
    # print(tags)
    
    # tags = soup.find_all('a',limit=1)     # limit=1: stop after one match
    # print(tags)
    
    # tags = soup.find_all(name='a', attrs={'class': 'sister'}, recursive=True, text='Lacie')
    # # tags = soup.find(name='a', class_='sister', recursive=True, text='Lacie')
    # print(tags)
    find_all: returns all matching tags
    # v = soup.find_all(name=['a','div'])       # name=['a','div']: find both 'a' tags and 'div' tags
    
    # print(v)
    
    # v = soup.find_all(class_=['sister0', 'sister'])   # class_=['sister0', 'sister']: class='sister0' or class='sister'
    # print(v)
    
    # v = soup.find_all(text=['Tillie'])
    # print(v, type(v[0]))
    
    
    # v = soup.find_all(id=['link1','link2'])
    # print(v)
    
    # v = soup.find_all(href=['link1','link2'])
    # print(v)
    List arguments
    #import re
    # rep = re.compile('p')
    # rep = re.compile('^p')
    # v = soup.find_all(name=rep)
    # print(v)
    
    # rep = re.compile('sister.*')
    # v = soup.find_all(class_=rep)
    # print(v)
    
    # rep = re.compile('http://www.oldboy.com/static/.*')
    # v = soup.find_all(href=rep)
    # print(v)
    Regular-expression arguments
    # def func(tag):
    #     return tag.has_attr('class') and tag.has_attr('id')       # tags for which the function returns True end up in v = soup.find_all()
    # v = soup.find_all(name=func)      # name=func walks every tag and calls the function on each one
    # print(v)
    Filtering with a function (rarely used)
    # tag = soup.find('a')
    # v = tag.get('id')
    # print(v)
    get: read a tag attribute
    # tag = soup.find('a')
    # v = tag.has_attr('id')
    # print(v)
    has_attr: check whether the tag has a given attribute
    # tag = soup.find('a')
    # v = tag.get_text()
    # print(v)
    get_text: get the text inside the tag
    # tag = soup.find('body')
    # v = tag.index(tag.find('div'))
    # print(v)
    # tag = soup.find('body')
    # for i,v in enumerate(tag):
    #     print(i,v)
    index: the position of a tag inside another tag
    is_empty_element: whether the tag is an empty / self-closing tag (a small sketch follows below)
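    A self-contained sketch of that check (the HTML snippet is made up):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div>ad<br/>sf</div>', 'html.parser')
    print(soup.find('br').is_empty_element)    # ---> True, <br/> is self-closing
    print(soup.find('div').is_empty_element)   # ---> False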
    # soup.next             # the next parsed element, tag or text (alias of next_element)
    # soup.next_element     # the next parsed element
    # soup.next_elements
    # soup.next_sibling     # the next sibling at the same level
    # soup.next_siblings
    
    # tag.previous
    # tag.previous_element
    # tag.previous_elements
    # tag.previous_sibling
    # tag.previous_siblings
    
    # tag.parent
    # tag.parents
    Elements related to the current tag (a small sketch follows below)
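    A self-contained sketch of a few of these navigation attributes (the HTML snippet is made up):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<body><a id="l1">one</a><a id="l2">two</a></body>', 'html.parser')
    tag = soup.find(id='l1')

    print(tag.next_element)                    # ---> 'one', the text node inside the tag
    print(tag.next_sibling)                    # ---> <a id="l2">two</a>
    print(tag.parent.name)                     # ---> body
    print([p.name for p in tag.parents])       # ---> ['body', '[document]']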
    # tag.find_next(...)
    # tag.find_all_next(...)
    # tag.find_next_sibling(...)
    # tag.find_next_siblings(...)
    
    # tag.find_previous(...)
    # tag.find_all_previous(...)
    # tag.find_previous_sibling(...)
    # tag.find_previous_siblings(...)
    
    # tag.find_parent(...)
    # tag.find_parents(...)
    # the arguments are the same as find_all
    Searching for a tag's related elements
    # soup.select("title")
    #
    # soup.select("p nth-of-type(3)")
    #
    # soup.select("body a")
    #
    # soup.select("html head title")
    #
    # tag = soup.select("span,a")
    #
    # soup.select("head > title")
    #
    # soup.select("p > a")
    #
    # soup.select("p > a:nth-of-type(2)")
    #
    # soup.select("p > #link1")
    #
    # soup.select("body > a")
    #
    # soup.select("#link1 ~ .sister")
    #
    # soup.select("#link1 + .sister")
    #
    # soup.select(".sister")
    #
    # soup.select("[class~=sister]")
    #
    # soup.select("#link1")
    #
    # soup.select("a#link2")
    #
    # soup.select('a[href]')
    #
    # soup.select('a[href="http://example.com/elsie"]')
    #
    # soup.select('a[href^="http://example.com/"]')
    #
    # soup.select('a[href$="tillie"]')
    #
    # soup.select('a[href*=".com/el"]')
    #
    # from bs4.element import Tag
    #
    #
    # def default_candidate_generator(tag):
    #     for child in tag.descendants:
    #         if not isinstance(child, Tag):
    #             continue
    #         if not child.has_attr('href'):
    #             continue
    #         yield child
    #
    #
    # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator)
    # print(type(tags), tags)
    #
    # from bs4.element import Tag
    #
    #
    # def default_candidate_generator(tag):
    #     for child in tag.descendants:
    #         if not isinstance(child, Tag):
    #             continue
    #         if not child.has_attr('href'):
    #             continue
    #         yield child
    #
    #
    # tags = soup.find('body').select("a", _candidate_generator=default_candidate_generator, limit=1)
    # print(type(tags), tags)
    select, select_one: CSS selectors; select returns all matches as a list, select_one returns a single one
    # tag = soup.find('span')
    # print(tag.string)          # get
    # tag.string = 'new content' # set
    # print(soup)
    
    # tag = soup.find('body')
    # print(tag.string)
    # tag.string = 'xxx'            # tag.text cannot be used to modify the content, tag.string can
    # print(soup)
    
    # tag = soup.find('body')
    # v = tag.stripped_strings  # recursively yields the text of all tags inside
    # print(v)
    Tag content
    # tag = soup.find('body')
    # tag.append(soup.find('a'))
    # print(soup)
    # to append a genuinely new tag (rather than moving an existing one), do it like this
    # from bs4.element import Tag
    # obj = Tag(name='i',attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.append(obj)
    # print(soup)
    append: appends a tag inside the current tag; if the appended tag already exists in the document, it is simply moved to the end
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # tag.insert(2, obj)
    # print(soup)
    insert: inserts a tag at a given position inside the current tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('body')
    # # tag.insert_before(obj)
    # tag.insert_after(obj)
    # print(soup)
    insert_after, insert_before: insert after or before the current tag
    # from bs4.element import Tag
    # obj = Tag(name='i', attrs={'id': 'it'})
    # obj.string = '我是一个新来的'
    # tag = soup.find('div')
    # tag.replace_with(obj)
    # print(soup)
    replace_with: replaces the current tag with the given tag
    # tag = soup.find('div')
    # a = soup.find('a')
    # tag.setup(previous_sibling=a)
    # print(tag.previous_sibling)
    setup: creates relationships between tags; not much use once created, it does not change the tags' actual positions
    # from bs4.element import Tag
    # obj1 = Tag(name='div', attrs={'id': 'it'})
    # obj1.string = '我是一个新来的'
    #
    # tag = soup.find('a')
    # v = tag.wrap(obj1)
    # print(soup)
    
    # tag = soup.find('a')
    # v = tag.wrap(soup.find('p'))
    # print(soup)
    wrap: wraps the current tag in the given tag
    # tag = soup.find('a')
    # v = tag.unwrap()
    # print(soup)
    unwrap: removes the current tag but keeps what it wrapped