zoukankan      html  css  js  c++  java
  • BeautifulSoup解析库的介绍和使用

    ### BeautifulSoup解析库的介绍和使用
    ### 三大选择器:节点选择器,方法选择器,CSS选择器
    ### 使用建议:方法选择器 > CSS选择器 > 节点选择器
    
    
    ## Sample HTML document parsed by the examples
    text = '''
    <html><head><title>there is money</title></head>
    <body>
    <p class="title" name="dmr"><b>there is money</b></p>
    <p class="money">good good study, day day up
    <a href="https://www.baidu.com/1" class="error" id="l1"><span><!-- 1 --></span></a>,
    <a href="https://www.baidu.com/2" class="error" id="l2"><span>2</span></a> and 
    <a href="https://www.baidu.com/3" class="error" id="l3">3</a>;
    66666666666
    </p>
    <p class='body'>...</p>
    '''

    1. 基本用法

    ## Basic usage
    from bs4 import BeautifulSoup
    
    # Build the BeautifulSoup object, parsing the sample markup with lxml.
    soup = BeautifulSoup(text, features='lxml')
    # Render the parsed tree with standard indentation and print it.
    pretty_markup = soup.prettify()
    print(pretty_markup)
    # Print the text content of the <title> node.
    title_node = soup.title
    print(title_node.string)
    
    '''
    输出内容:
    <html>
     <head>
      <title>
       there is money
      </title>
     </head>
     <body>
      <p class="title" name="dmr">
       <b>
        there is money
       </b>
      </p>
      <p class="money">
       good good study, day day up
       <a class="error" href="https://www.baidu.com/1" id="l1">
        <!-- 1 -->
       </a>
       ,
       <a class="error" href="https://www.baidu.com/2" id="l2">
        2
       </a>
       and
       <a class="error" href="https://www.baidu.com/3" id="l3">
        3
       </a>
       ;
    66666666666
      </p>
      <p class="body">
       ...
      </p>
     </body>
    </html>
    there is money
    '''

    2. 节点选择器

    ### Node selectors
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    print(type(soup))
    # Attribute-style access (soup.<tag name>) yields a Tag object.
    title_tag = soup.title
    print(title_tag)
    print(type(title_tag))
    # With several <p> tags in the document, soup.p is the first one.
    for node in (soup.p, soup.head):
        print(node)
    
    '''
    输出结果:
    <class 'bs4.BeautifulSoup'>
    <title>there is money</title>
    <class 'bs4.element.Tag'>
    <p class="title" name="dmr"><b>there is money</b></p>
    <head><title>there is money</title></head>
    '''
    
    
    ## Extracting information
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    # Text content of the <title> tag
    print(soup.title.string)
    first_p = soup.p
    # Name of the p tag
    print(first_p.name)
    # Attributes of the p tag, as a dict
    p_attrs = first_p.attrs
    print(p_attrs)
    print(p_attrs.get('name'))
    # .attrs can be omitted: the tag itself supports dict-style lookups
    print(first_p['class'])
    print(first_p.get('class'))
    print(first_p.string)
    
    '''
    输出内容:
    there is money
    p
    {'class': ['title'], 'name': 'dmr'}
    dmr
    ['title']
    ['title']
    there is money
    '''
    
    
    ## Nested selection: chain node selectors to drill down the tree
    
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    # First <p> inside <body>
    first_body_p = soup.body.p
    print(first_body_p.string)
    
    '''
    输出内容:
    there is money
    '''
    
    
    ## Related-node selection
    ## Children and descendants
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    body = soup.body
    # Direct children, including newline text nodes: .contents is a list,
    # .children is an iterator (the recommended form).
    direct_children = body.contents
    print(direct_children)
    print(len(direct_children))
    print(body.children)
    for idx, node in enumerate(body.children):
        print(idx, node)
    # .descendants is a generator that walks every nested node, not just
    # the direct children.
    print(body.descendants)
    for idx, node in enumerate(body.descendants):
        print(idx, node)
    
    '''
    输出结果:
    ['
    ', <p class="title" name="dmr"><b>there is money</b></p>, '
    ', <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>, '
    ', <p class="body">...</p>, '
    ']
    7
    <list_iterator object at 0x0000000002DAD320>
    0 
    
    1 <p class="title" name="dmr"><b>there is money</b></p>
    2 
    
    3 <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    4 
    
    5 <p class="body">...</p>
    6 
    
    <generator object Tag.descendants at 0x0000000002D67E58>
    0 
    
    1 <p class="title" name="dmr"><b>there is money</b></p>
    2 <b>there is money</b>
    3 there is money
    4 
    
    5 <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    6 good good study, day day up
    
    7 <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>
    8 <span><!-- 1 --></span>
    9  1 
    10 ,
    
    11 <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a>
    12 <span>2</span>
    13 2
    14  and 
    
    15 <a class="error" href="https://www.baidu.com/3" id="l3">3</a>
    16 3
    17 ;
    66666666666
    
    18 
    
    19 <p class="body">...</p>
    20 ...
    21 
    '''
    
    
    ## Parent and ancestor nodes
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    first_link = soup.a
    # .parent is the immediate parent; .parents is a generator that walks
    # outward through every ancestor.
    print(first_link.parent)
    print(first_link.parents)
    for depth, ancestor in enumerate(first_link.parents):
        print(depth, ancestor)
    
    '''
    输出结果:
    <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    <generator object PageElement.parents at 0x0000000002D68E58>
    0 <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    1 <body>
    <p class="title" name="dmr"><b>there is money</b></p>
    <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    <p class="body">...</p>
    </body>
    2 <html><head><title>there is money</title></head>
    <body>
    <p class="title" name="dmr"><b>there is money</b></p>
    <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    <p class="body">...</p>
    </body></html>
    3 <html><head><title>there is money</title></head>
    <body>
    <p class="title" name="dmr"><b>there is money</b></p>
    <p class="money">good good study, day day up
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>,
    <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a> and 
    <a class="error" href="https://www.baidu.com/3" id="l3">3</a>;
    66666666666
    </p>
    <p class="body">...</p>
    </body></html>
    '''
    
    
    ## Sibling nodes
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    first_link = soup.a
    # Singular attributes return the adjacent node itself (text nodes here).
    print('Next sibling: ', first_link.next_sibling)
    print('Previous sibling: ', first_link.previous_sibling)
    # Plural attributes return generators over all siblings on that side.
    # Fixed: the last line previously read .previous_sibling (singular) and
    # so repeated the value printed two lines above.
    print('Next siblings: ', first_link.next_siblings)
    print('Previous siblings: ', first_link.previous_siblings)
    
    # Expected output (generator addresses differ from run to run):
    '''
    输出结果:
    Next sibling:  ,
    
    Previous sibling:  good good study, day day up
    
    Next siblings:  <generator object PageElement.next_siblings at 0x0000000002D67E58>
    Previous siblings:  <generator object PageElement.previous_siblings at 0x...>
    '''

    3. 方法选择器

    ### Method selectors: the most flexible of the three
    ## find_all returns every match as a list-like result (Tag objects when
    ## searching by name/attributes)
    ## find returns only the first match, as a single object
    ## The same plural/singular pairing exists for:
    ## find_parents and find_parent
    ## find_next_siblings and find_next_sibling
    ## find_previous_siblings and find_previous_sibling
    ## find_all_next and find_next
    ## find_all_previous and find_previous
    from bs4 import BeautifulSoup
    import re
    
    soup = BeautifulSoup(text, 'lxml')
    # All nodes whose tag name is "a", then the first of them
    links = soup.find_all(name='a')
    print(links)
    print(links[0])
    # Nodes whose id attribute is "l1", then nodes whose class is "error"
    print(soup.find_all(attrs={'id': 'l1'}))
    print(soup.find_all(class_='error'))
    # Match text nodes by content. `string=` is the current keyword for this
    # filter; `text=` is its legacy name (deprecated since bs4 4.4.0).
    print(soup.find_all(string=re.compile('money')))
    
    '''
    输出内容:
    [<a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>, <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a>, <a class="error" href="https://www.baidu.com/3" id="l3">3</a>]
    <a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>
    [<a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>]
    [<a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>, <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a>, <a class="error" href="https://www.baidu.com/3" id="l3">3</a>]
    ['there is money', 'there is money']
    '''

    4. CSS选择器

    ### CSS selectors: select() returns a list-like collection of matches
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    # Descendant selector, class selector, then id + descendant selector
    for css in ('p a', '.error', '#l1 span'):
        print(soup.select(css))
    # Plain tag selector, plus the type of the returned collection
    links = soup.select('a')
    print(links)
    print(type(links))
    
    '''
    输出内容:
    [<a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>, <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a>, <a class="error" href="https://www.baidu.com/3" id="l3">3</a>]
    [<a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>, <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a>, <a class="error" href="https://www.baidu.com/3" id="l3">3</a>]
    [<span><!-- 1 --></span>]
    [<a class="error" href="https://www.baidu.com/1" id="l1"><span><!-- 1 --></span></a>, <a class="error" href="https://www.baidu.com/2" id="l2"><span>2</span></a>, <a class="error" href="https://www.baidu.com/3" id="l3">3</a>]
    <class 'bs4.element.ResultSet'>
    '''
    
    ## Nested selection, reading attributes, reading text
    from bs4 import BeautifulSoup
    
    soup = BeautifulSoup(text, 'lxml')
    links = soup.select('a')
    # Nested selection: run select() again on each matched tag
    for link in links:
        print(link.select('span'))
    # Reading attributes
    first_link = links[0]
    print(first_link.attrs)
    print(first_link.get('class'))
    # Reading text
    print(links[1].string)
    print(links[2].get_text())
    
    '''
    输出结果:
    [<span><!-- 1 --></span>]
    [<span>2</span>]
    []
    {'href': 'https://www.baidu.com/1', 'class': ['error'], 'id': 'l1'}
    ['error']
    2
    3
    '''
  • 相关阅读:
    GDB编辑、搜索源码以及在线帮助
    GDB查看栈信息
    GDB信号处理
    GDB反向调试
    GDB调试多进程程序
    GDB后台调试命令
    GDB non-stop模式
    GDB调试多线程程序
    GDB禁用删除断点
    解决Mac OS下Eclipse、IntelliJ IDEA打开其他窗口默认全屏
  • 原文地址:https://www.cnblogs.com/Caiyundo/p/12507111.html
Copyright © 2011-2022 走看看