zoukankan      html  css  js  c++  java
  • BeautifulSoup解析库

    # Demo: locating tags with BeautifulSoup.find_all and reading their
    # attributes and text.
    from bs4 import BeautifulSoup

    # Sample document.
    # NOTE: the <h2> tag is written `class - "title"` (a dash instead of `=`),
    # so it does not carry a usable class value; the recorded run output shows
    # it parsed as <h2 class="">. It is kept as-is so the output stays the same.
    html = """
    <!DOCTYPE html>
    <html>
        <head>
            <meta charset = "utf-8">
            <title>this is a Demo</title>
        </head>
        <body>
                <div id = "container">
                    <div class = "wrapper" bit = "id">
                        <h2 class - "title">Hello World</h2>
                        <p class = "text">Hello,this is a paragraph.</p>
                </div>
                    </div>
                <a href = "www.baidu.com">百度</a>
        </body>
    </html>
    """

    # Step 2: instantiate the BeautifulSoup object with the lxml parser.
    soup = BeautifulSoup(html, "lxml")

    # find_all returns every matching tag.
    divs = soup.find_all("div")
    print(type(divs))
    # Prints: <class 'bs4.element.ResultSet'>
    # ResultSet is a list subclass: each matching <div> is one element, and it
    # can be printed, iterated, indexed and sliced like any other list.
    print(divs)
    for div in divs:
        print(div)
        print("*" * 60)

    # Pick one specific tag by index.
    div1 = soup.find_all("div")[1]
    print(div1)
    print("#" * 60)

    # Filter the matches with a slice (no list() conversion is needed).
    div_select = soup.find_all("div")[1:2]  # selects the second <div>
    for div in div_select:
        print(div)
        print("*" * 60)

    # Find tags that have a given attribute value.
    # Method 1: pass the attribute as a keyword argument.
    divs_attribute = soup.find_all("div", id="container")
    print(divs_attribute)
    print("*" * 60)
    # Method 2: pass the attributes as a dict through `attrs`.
    divs_attribute = soup.find_all("div", attrs={"id": "container"})
    print(divs_attribute)

    # Find tags that match several attributes at once.
    # When an attribute name collides with a Python keyword, append an
    # underscore to the keyword argument, e.g. `class_`.

    divs = soup.find_all("div", class_="wrapper", bit="id")
    print(divs)
    print("#" * 60)

    # Read an attribute value from a tag.
    a = soup.find_all("a")[0]
    # Method 1: subscript the tag with the attribute name.
    href = a["href"]
    print(href)
    # Method 2: go through the tag's `attrs` dict.
    href = a.attrs["href"]
    print(href)

    # Read the text of a tag.
    # Method 1: `.string`.
    inf = a.string
    print(inf)
    # Method 2: `.strings` iterates over all text under the tag.
    inf = list(a.strings)
    print(inf[0])
    # `.stripped_strings` keeps the text content while removing meaningless
    # whitespace such as spaces and line breaks.

    代码运行结果

    <class 'bs4.element.ResultSet'>
    [<div id="container">
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    </div>, <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>]
    <div id="container">
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    </div>
    ************************************************************
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    ************************************************************
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    ############################################################
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    ************************************************************
    [<div id="container">
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    </div>]
    ************************************************************
    [<div id="container">
    <div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>
    </div>]
    [<div bit="id" class="wrapper">
    <h2 class="">Hello World</h2>
    <p class="text">Hello,this is a paragraph.</p>
    </div>]
    ############################################################
    www.baidu.com
    www.baidu.com
    百度
    百度

    笨鸟先飞
  • 相关阅读:
    Conntect Bluetooth devices in iOS.
    Why NSAttributedString import html must be on main thread?
    IOS7 SDK 几宗罪
    How to browse the entire documentation using XCode 5 Documentation and API Reference ?
    High Precision Timers in iOS / OS X
    [UWP]抄抄《CSS 故障艺术》的动画
    [Microsoft Teams]使用连接器接收Azure DevOps的通知
    [WPF 自定义控件]自定义一个“传统”的 Validation.ErrorTemplate
    [WPF 自定义控件]在MenuItem上使用RadioButton
    [WPF 自定义控件]创建包含CheckBox的ListBoxItem
  • 原文地址:https://www.cnblogs.com/zoutingrong/p/13826435.html
Copyright © 2011-2022 走看看