zoukankan      html  css  js  c++  java
  • BeautifulSoup使用

    import re
    
    from bs4 import BeautifulSoup
    
    # Sample document used by the node-selector examples below.
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """

    soup = BeautifulSoup(html, "lxml")
    # soup.prettify() would give a pretty-printed version of the tree.
    first_p = soup.p  # attribute access yields the first <p> in the document
    print(first_p.attrs)

    # ---------------------------------------------------------------
    # Node selectors
    # ---------------------------------------------------------------

    # Selecting elements by tag name.
    title_tag = soup.title
    print(type(title_tag))  # <class 'bs4.element.Tag'>
    print(title_tag.string)  # The Dormouse's story
    print(soup.head)  # <head><title>The Dormouse's story</title></head>
    print(first_p)  # <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

    # Extracting information from a tag.
    # Tag name:
    print(title_tag.name)  # title
    print(first_p.name)  # p
    # Attributes: either through the .attrs dict or by indexing the tag itself.
    p_attrs = first_p.attrs
    print(p_attrs)  # {'class': ['title'], 'name': 'dromouse'}
    print(p_attrs['name'])  # dromouse
    print(first_p['name'])  # dromouse
    print(first_p['class'])  # ['title']
    # Text content:
    print(first_p.string)  # The Dormouse's story

    # Nested selection: a selected tag can be drilled into the same way.
    print(soup.head.title.string)  # The Dormouse's story
    
    # Second sample document: same story, but without the title paragraph, so
    # the first <p> is the one holding the three links.
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html, "lxml")

    # ---------------------------------------------------------------
    # Relationship-based selection
    # ---------------------------------------------------------------

    # (1) Children and descendants
    print('(1)子节点和子孙节点')
    story = soup.p
    # .contents is the list of direct children.
    print(story.contents)
    # .children walks the same direct children, but lazily.
    print(story.children)  # e.g. <list_iterator object at 0x7fdf9fa12820>
    for i, child in enumerate(story.children):
        print(i, child)
    # Sample output -- direct children only; the <span> lives inside the first
    # <a>, so it is not listed on its own:
    #
    # 0 Once upon a time there were three little sisters; and their names were
    #
    # 1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
    # 2
    #
    # 3 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    # 4  and
    #
    # 5 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    # 6
    # and they lived at the bottom of a well.

    # .descendants walks every nested node, not just the direct children.
    print(story.descendants)  # e.g. <generator object Tag.descendants at 0x7fdf9fa9bb30>
    for i, child in enumerate(story.descendants):
        print(i, child)

    # Sample output -- the <span>Elsie</span> inside the first <a> now shows up
    # as its own entry, followed by its text:
    #
    # 0 Once upon a time there were three little sisters; and their names were
    #
    # 1 <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
    # 2 <span>Elsie</span>
    # 3 Elsie
    # 4
    #
    # 5 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
    # 6 Lacie
    # 7  and
    #
    # 8 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    # 9 Tillie
    # 10
    # and they lived at the bottom of a well.

    # (2) Parent and ancestors
    first_link = soup.a
    print(first_link.parent)  # the direct parent
    print(first_link.parents)  # all ancestors (an iterable; not expanded here)

    # (3) Siblings
    print(first_link.next_sibling)  # the node right after this one
    print(first_link.previous_sibling)  # the node right before this one
    print(first_link.next_siblings)  # generator over the following siblings
    print(first_link.previous_siblings)  # generator over the preceding siblings

    # Extracting information from the selected nodes.
    print(first_link.string)  # text content
    # Iterating the parent yields its direct children; per the listing above,
    # index 0 is the leading text and index 1 is the first <a>.
    parent_children = list(first_link.parent)
    print(parent_children[1].attrs['class'])  # attribute value
    
    # print(soup.find_all('p', attrs={'class': 'title'})[0].get_text())
    
    """
    方法选择器
    """
    # Signature, for reference:
    # find_all(self, name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs)
    # (`string` was called `text` before Beautiful Soup 4.4.0; the old keyword
    # is deprecated.)
    html = '''
    <div class="panel">
    <div class="panel-heading">
    <h4>Hello</h4>
    </div>
    <div class="panel-body">
    <ul class="list" id="list-1" name="elements">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    <li class="element">Jay</li>
    </ul>
    <ul class="list list-small" id="list-2">
    <li class="element">Foo</li>
    <li class="element">Bar</li>
    </ul>
    </div>
    </div>
    '''
    soup = BeautifulSoup(html, 'lxml')

    # name: match on the tag name. Query once and reuse the result below.
    ul_tags = soup.find_all(name='ul')
    print(ul_tags[0])
    print(type(ul_tags[0]))  # <class 'bs4.element.Tag'>
    # Every result is a Tag, so it can be searched again for its nested <li> nodes.
    for ul in ul_tags:
        print(ul.find_all(name='li'))
    # [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
    # [<li class="element">Foo</li>, <li class="element">Bar</li>]

    # attrs: match on attribute values.
    print(soup.find_all(attrs={'id': 'list-1'}))
    print(soup.find_all(attrs={'name': 'elements'}))
    # The common attributes id and class can be passed as keywords directly;
    # class needs the trailing underscore because `class` is a Python keyword.
    print(soup.find_all(id='list-1'))
    print(soup.find_all(class_='element'))

    # string: match text nodes, here with a regular expression. This uses the
    # current keyword name instead of the deprecated `text`.
    print(soup.find_all(string=re.compile('Foo')))

    # Related search methods:
    # soup.find()
    # soup.find_parent()
    # soup.find_parents()
    # soup.find_next_sibling()
    # soup.find_next_siblings()
    # soup.find_previous_sibling()
    # soup.find_all_next()
    # soup.find_all_previous()
    # ....
    
    """
    css选择器  
    """
    # select() returns every node matching a CSS selector.
    for selector in ('.panel .panel-heading', 'ul li', '#list-2 .element'):
        print(soup.select(selector))

    # select_one() returns only the first match, which can then be queried
    # like any other tag.
    first_ul = soup.select_one('ul')
    print(first_ul['id'])  # list-1
    print(first_ul.attrs['id'])  # list-1
    first_li = soup.select_one('li')
    print(first_li.get_text())  # Foo
    print(first_li.string)  # Foo
    
  • 相关阅读:
    Spring中的Bean的配置形式
    使用外部属性文件配置Bean以及Bean的生命周期方法
    运行时找到main方法所在的类
    获取SpringMVC中所有RequestMapping映射URL信息
    RequestBody只能读取一次的问题
    接口标记为@ResponseBody却不进入ResponseBodyAdvice
    springboot打成jar包后无法解压
    Springboot打包执行源码解析
    关于base64的一个小细节
    Liquibase使用入门
  • 原文地址:https://www.cnblogs.com/fly-book/p/15092858.html
Copyright © 2011-2022 走看看