zoukankan      html  css  js  c++  java
  • python BeautifulSoup4解析网页

    # Sample document used by the examples that follow.
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.</p>
    <p class="story">...</p></body></html>
    """

    # NOTE(review): `BS` is not defined anywhere in the visible code; presumably
    # it is bs4.BeautifulSoup imported under that alias -- confirm.
    soup = BS(html, 'html.parser')

    # Show .text next to .string for every <a> tag.
    for link in soup.find_all('a'):
        # .text is a plain str; commented-out content is not printed.
        print('i.text:', link.text)
        # .string is a NavigableString object; commented-out content is printed.
        print('i.string:', link.string)
    
    
    # Direct children of <head>, printed together with the container type.
    print('soup.head.contents:', soup.head.contents, type(soup.head.contents))
    print('soup.head.children:', soup.head.children, type(soup.head.children))

    # .contents returns a list of the child elements.
    print('soup.body.contents:', soup.body.contents)
    # .children returns an iterator over the child elements.
    print('soup.body.children:', soup.body.children)

    for child in soup.body.children:
        print(child)

    # Walk every descendant node, not only the direct children.
    print('子孙节点 都显示出来')
    for node in soup.body.descendants:
        print(node)

    print('soup.body.string:', soup.body.string)
    print('soup.body.strings:', soup.body.strings)
    # stripped_strings is displayed with all whitespace filtered out.
    print('soup.body.stripped_strings:', soup.body.stripped_strings)

    # Print the whitespace-stripped strings found under <body>.
    print('去掉空格的body子元素:')
    for stripped in soup.body.stripped_strings:
        print(stripped)
    
    
    # Navigate from the first <a> tag to its parent, siblings and neighbours.
    print('soup.a.parent:', soup.a.parent)
    # Note: text nodes and line breaks can also be the previous or next
    # sibling of the current node.
    print('soup.a.next_sibling:', soup.a.next_sibling)
    print('soup.a.previous_sibling:', soup.a.previous_sibling)
    # The next element is not necessarily on the same level.
    print('soup.a.next_element:', soup.a.next_element)
    print('soup.a.previous_element:', soup.a.previous_element)

    # The original literal was split across two lines (a syntax error); the
    # line break is written as an escape so the string is terminated properly.
    print('打印所有后面的同级节点:\n')
    for sibling in soup.a.next_siblings:
        print(sibling)

    # Index 1 selects the second item produced by next_elements.
    print('soup.a.next_element:', list(soup.a.next_elements)[1])
    
    
    print('***********find_all*****')

    # Look up tags by name.
    anchors = soup.find_all('a')
    print(anchors)

    print('引入正则表达式:')

    import re
    # The regular expression is matched against tag names.
    tag_name_pattern = re.compile(r'^title')
    print(soup.find_all(tag_name_pattern))

    # A list matches any of the listed tag names.
    print('列表的方式匹配:')
    wanted_names = ['a', 'b']
    print(soup.find_all(wanted_names))

    print('函数的方式匹配,类似filter')
    def func(tag):
        """Return *tag* if it has a ``class`` attribute and its name starts with "a".

        Returns None otherwise, so the function can be used as a filter.
        """
        if not tag.has_attr('class'):
            return None
        if re.search(r'^a', tag.name) is None:
            return None
        return tag
    
    # Use the function above as the find_all filter.
    print(soup.find_all(func))


    # Rebuild the soup from the same sample document.
    html = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    and they lived at the bottom of a well.</p>
    <p class="story">...</p></body></html>
    """

    soup = BS(html, 'html.parser')

    # Search by attribute value.
    print('按属性值查找:')
    print(soup.find_all(id='link1'))
    print(soup.find_all('a', id='link1'))

    # Every one of these calls returns a list.
    print(soup.find_all(id='link2', href=re.compile(r'laci')))
    # Note the trailing underscore on class_.
    print(soup.find_all(class_='story'))
    print(soup.find_all(attrs={'class': 'sister'}))
    
    # Search by element content using the text argument.
    print('按元素内容查找text参数:')
    print(soup.find_all(text='Tillie'))
    # These calls return the matching element content.
    print(soup.find_all(text=['Tillie', 'Lacie']))
    print(soup.find_all(text=re.compile(r'ormous')))

    # Starting from matched content, climb up to the enclosing elements.
    print('通过内容元素 找到上级元素')
    matched_strings = soup.find_all(text=re.compile(r'ormous'))
    print(matched_strings[1].parent.parent)

    # Limit the number of results.
    print('limit:')
    print(soup.find_all('a', limit=2))

    print('只在子节点查找:')
    # recursive=False searches child nodes only.
    print(soup.body.find_all('a', limit=2, recursive=False))
    print(soup.body.find_all(class_='story', recursive=False))
  • 相关阅读:
    OpenGL3:先导篇 数据类型
    Linux开发:同步与异步
    前端面试题
    工具
    API和DLL
    CSS了一个浮动导航条
    AJAX背景技术介绍
    2014年8月18日17:02:53
    怎么增加照片的KB大小
    HTML5增加的几个新的标签
  • 原文地址:https://www.cnblogs.com/xiaoxiao075/p/10925489.html
Copyright © 2011-2022 走看看