zoukankan      html  css  js  c++  java
  • BeautifulSoup

    # Requires: pip3 install beautifulsoup4
    
    from bs4 import BeautifulSoup
    
    # Sample HTML fragment used as input for the filtering demo below.
    # It must be defined BEFORE the soup is built from it; building the soup
    # first would raise NameError because `comment` does not exist yet.
    comment ="""
        <p id="i1">
            我是中国人
        </p>
        <p >
           <script>alert(123)</script>
        </p>
        <p id="i2">
            <span>我是中国人</span>
        </p>
        <p>
            <br />
        </p>
        <p id="i3">
            <span>我是中国人</span><img src="/static/images/1.jpg" alt="" />
        </p>
    
    """
    
    # "html.parser" is the parser bundled with Python; BeautifulSoup uses it
    # to turn the HTML text into a tree of tag objects.
    soup = BeautifulSoup(comment, "html.parser")
    
    # tag = soup.find(name="span")    #找第一个标签
    # print(tag)
    
    # obj = soup.find(attrs={"id":"i2"})  #查找属性 查找第一个
    # print(obj)
    
    # obj = soup.find(name="p",attrs={"id":"i2"})  #并且
    # print(obj)
    
    
    # obj = soup.find_all(name="p") #查找属性 查找所有
    # print(obj)
    
    
    
    #查找所有内容,匹配到的清空内容,不删除标签clear()
    # valid_tag = ["p","img","div"]
    #
    # tags = soup.find_all()
    # for tag in tags:
    #     if tag.name not in valid_tag:
    #         tag.clear()
    # print(soup)
    
    
    #查找所有内容,匹配到的删除标签
    # valid_tag = ["p","img","div"]
    #
    # tags = soup.find_all()
    # for tag in tags:
    #     if tag.name not in valid_tag:
    #         tag.decompose()
    # print(soup)
    
    #取到的soup是对象,转换成字符串
    # print(soup.decode())
    
    
    # Whitelist filter: keep only the tags listed in `valid_tag`, and on each
    # kept tag keep only the attributes listed for it. Everything else is
    # removed from the tree.
    valid_tag = {
        "p": ["class", "id"],
        "img": ["src"],
        "div": ["class"],
    }
    
    tags = soup.find_all()
    for tag in tags:
        allowed_attrs = valid_tag.get(tag.name)
        if allowed_attrs is None:
            # Tag is not whitelisted: remove it together with its contents and
            # skip the attribute check. Without this `continue`, the lookup
            # valid_tag[tag.name] below would raise KeyError for a removed tag
            # that carries attributes.
            tag.decompose()
            continue
        # Iterate over a copy of the keys because attributes are deleted
        # while looping.
        for k in list(tag.attrs):
            if k not in allowed_attrs:
                del tag.attrs[k]
    
    # The soup is an object tree; decode() serializes it back to a string.
    content_str = soup.decode()
    print(content_str)
    过滤演示.py

      

  • 相关阅读:
    [swustoj 1021] Submissions of online judge
    [swustoj 404] 最小代价树
    [swustoj 917] K-lucky-number
    [swustoj 183] 种树
    [LA 3887] Slim Span
    [ahu 1248] NBA Finals
    用js获取当前月份的天数
    WampServer
    jquery checkbox选中、改变状态、change和click事件
    为什么排版引擎解析 CSS 选择器时一定要从右往左解析?
  • 原文地址:https://www.cnblogs.com/golangav/p/7212639.html
Copyright © 2011-2022 走看看