zoukankan      html  css  js  c++  java
  • BeautifulSoup4 库的基本使用

      喜欢我的博客可以加关注,有问题可以提问我。

      1.基本使用(下面的 html 由于过长就不重复粘贴了,后面的示例都复用第一个)

    html="""
    <html>
    <head><title>dsojfeoifjosieofiej</title></head>
        
        <meta http-equiv="content-type" content="text/html;charset=utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=Edge">
        <meta content="always" name="referrer">
        <meta name="theme-color" content="#2932e1">
        <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />
        <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" />
        <link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu_85beaf5496f291521eb75ba38eacbd87.svg">
        <link rel="dns-prefetch" href="//s1.bdstatic.com"/>
        <link rel="dns-prefetch" href="//t11.baidu.com"/>
        <link rel="dns-prefetch" href="//t12.baidu.com"/>
        <link rel="dns-prefetch" href="//b1.bdstatic.com"/>
    """
    from bs4 import BeautifulSoup

    # Build the parse tree from the sample markup using the lxml parser.
    soup = BeautifulSoup(html, 'lxml')

    # Print the prettified document, then the string inside <title>.
    pretty_markup = soup.prettify()
    print(pretty_markup)
    title_tag = soup.title
    print(title_tag.string)

      2.选择元素

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Attribute-style access (soup.<tagname>) selects a tag by its name.
    print(soup.title)
    print(soup.head)
    # When several tags share the name, only the first one is returned.
    print(soup.p)

      3.获取名称

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # .name is the name of the selected tag.
    first_p = soup.p
    print(first_p.name)

      4.获取属性

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    first_p = soup.p

    # Two ways to read the "name" attribute of the tag:
    # through the .attrs mapping, or by indexing the tag directly.
    print(first_p.attrs['name'])
    print(first_p['name'])

      5.获取内容

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # .string gives the text held by the selected tag.
    first_p = soup.p
    print(first_p.string)

      6.嵌套选择

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Tag lookups can be chained: <head> -> <title> -> its string.
    head_tag = soup.head
    title_tag = head_tag.title
    print(title_tag.string)

      7.子节点和子孙节点

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .contents: the direct children of the tag.
    print(soup.p.contents)

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .children: also the direct children, printed first as the raw
    # object and then one child at a time.
    print(soup.p.children)
    for i, child in enumerate(soup.p.children):
        print(i, child)

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .descendants: children, grandchildren, and so on.
    # The attribute name must be spelled exactly: a misspelled attribute is
    # interpreted as a child-tag lookup and yields None, which cannot be
    # enumerated.
    print(soup.p.descendants)
    for i, child in enumerate(soup.p.descendants):
        print(i, child)

      8.父节点和祖先节点

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .parent: the direct parent of the first <a> tag.
    first_link = soup.a
    print(first_link.parent)

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # .parents: every ancestor of the first <a> tag, numbered.
    numbered_ancestors = list(enumerate(soup.a.parents))
    print(numbered_ancestors)

      9.兄弟节点

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    first_link = soup.a

    # Siblings that follow the first <a> tag, numbered.
    following = list(enumerate(first_link.next_siblings))
    print(following)

    # Siblings that precede the first <a> tag, numbered.
    preceding = list(enumerate(first_link.previous_siblings))
    print(preceding)

      10.标准选择器

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # For every <ul> in the document, print the list of <li> tags inside it.
    ul_tags = soup.find_all('ul')
    for ul_tag in ul_tags:
        li_tags = ul_tag.find_all('li')
        print(li_tags)

      10.1加参数

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Filter by attribute values through the attrs dictionary.
    by_id = soup.find_all(attrs={'id':'list-1'})
    print(by_id)
    by_name = soup.find_all(attrs={'name':'elements'})
    print(by_name)


    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # The same kind of filter written as keyword arguments; "class" is a
    # Python keyword, so the argument is spelled class_.
    print(soup.find_all(id='list-1'))
    # NOTE(review): the CSS-selector examples use the class "element"
    # (singular) - confirm which class name the sample markup really has.
    print(soup.find_all(class_='elements'))

      10.2text

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Search by text content: the result is the matching strings themselves,
    # not the tags that contain them.  `string` is the current name of this
    # argument; `text` is its deprecated older spelling.
    print(soup.find_all(string='Foo'))

      10.3 find(返回单个元素就是第一个元素)

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # find() returns a single element: the first match.
    first_ul = soup.find('ul')
    print(first_ul)
    print(type(first_ul))

    # Look up a tag name that is not an HTML element.
    missing = soup.find('page')
    print(missing)

      10.4 find_parents() find_parent()(这里和上面的类似就不粘贴代码了)

      10.5 find_next_siblings() find_next_sibling()(这里和上面的类似就不粘贴代码了)

      11. CSS 选择器

    
    
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Tags with class "panel-heading" nested under a tag with class "panel".
    print(soup.select('.panel .panel-heading'))
    # <li> tags nested under a <ul> tag.
    print(soup.select('ul li'))
    # Tags with class "element" nested under the tag whose id is "list-2".
    print(soup.select('#list-2 .element'))
    # Type of a single item in the result list.
    print(type(soup.select('ul')[0]))


    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')
    # Nested selection: run a second selector on each selected <ul> to list
    # its <li> items (selecting 'ul' again would only find lists nested
    # inside lists).
    for ul in soup.select('ul'):
        print(ul.select('li'))

      11.1 获取属性

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Print the id of every <ul> twice: once by indexing the tag and once
    # through its .attrs mapping.
    ul_tags = soup.select('ul')
    for ul_tag in ul_tags:
        print(ul_tag['id'])
        print(ul_tag.attrs['id'])

      11.2 获取内容

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html, 'lxml')

    # Print the text of every <li> tag.
    li_tags = soup.select('li')
    for li_tag in li_tags:
        item_text = li_tag.get_text()
        print(item_text)
  • 相关阅读:
    依赖属性
    浅拷贝与深拷贝
    使用Advanced Installer打包工具如何设置是否安装预安装程序包
    WPF布局容器
    找不到UseInMemoryDatabase方法
    从零开始学.net core(一)
    那些年我们改过的规则代码
    办公达人私藏的EXCEL辅助工具,一人抵十人,高效办公就靠它了!
    面试题:整理
    面试: Vue数组的变异方法
  • 原文地址:https://www.cnblogs.com/zll20153246/p/9632756.html
Copyright © 2011-2022 走看看