from bs4 import BeautifulSoup

# Demo: locating tags, attributes and text with BeautifulSoup.find_all.
#
# Sample document used by every lookup below.
# NOTE: the <h2> tag is written `class - "title"` (most likely a typo for
# `class = "title"`). It is left untouched on purpose: the recorded output
# of this script shows the tag parsed as <h2 class="">, and changing the
# literal would change what the script prints.
html = """
<!DOCTYPE html>
<html>
<head>
<meta charset = "utf-8">
<title>this is a Demo</title>
</head>
<body>
<div id = "container">
<div class = "wrapper" bit = "id">
<h2 class - "title">Hello World</h2>
<p class = "text">Hello,this is a paragraph.</p>
</div>
</div>
<a href = "www.baidu.com">百度</a>
</body>
</html>
"""

# Instantiate the BeautifulSoup object (lxml parser).
soup = BeautifulSoup(html, "lxml")

# --- All tags with a given name --------------------------------------------
divs = soup.find_all("div")
print(type(divs))
# Prints: <class 'bs4.element.ResultSet'>
# The ResultSet prints like a list and is iterated, indexed and sliced like
# one below; every matching <div> is one element of it.
print(divs)
for div in divs:
    print(div)
    print("*" * 60)

# --- One tag, selected by position -----------------------------------------
div1 = soup.find_all("div")[1]
print(div1)
print("#" * 60)

# --- A slice of the matches ------------------------------------------------
# [1:2] keeps only the second <div>; no list() wrapper is needed because the
# result of find_all can be sliced directly.
div_select = soup.find_all("div")[1:2]
for div in div_select:
    print(div)
    print("*" * 60)

# --- Tags with a specific attribute ----------------------------------------
# Option 1: pass the attribute as a keyword argument.
divs_attribute = soup.find_all("div", id="container")
print(divs_attribute)
print("*" * 60)
# Option 2: pass the attributes as a dict through `attrs`.
divs_attribute = soup.find_all("div", attrs={"id": "container"})
print(divs_attribute)

# --- Tags matching several attributes at once -------------------------------
# When an attribute name clashes with a Python keyword, append an underscore
# to the keyword argument, e.g. `class_`.
divs = soup.find_all("div", class_="wrapper", bit="id")
print(divs)
print("#" * 60)

# --- Reading an attribute value --------------------------------------------
a = soup.find_all("a")[0]
# Option 1: subscript access on the tag.
href = a["href"]
print(href)
# Option 2: go through the tag's `attrs` mapping.
href = a.attrs["href"]
print(href)

# --- Reading the tag text --------------------------------------------------
# Option 1: `.string`.
inf = a.string
print(inf)
# Option 2: `.strings` iterates over all the text under the tag.
inf = list(a.strings)
print(inf[0])

# `.stripped_strings` can be used instead of `.strings` to get the same text
# with meaningless whitespace (spaces, newlines, ...) removed.
代码运行结果
<class 'bs4.element.ResultSet'>
[<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>, <div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>]
<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>
************************************************************
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
************************************************************
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
############################################################
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
************************************************************
[<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>]
************************************************************
[<div id="container">
<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>
</div>]
[<div bit="id" class="wrapper">
<h2 class="">Hello World</h2>
<p class="text">Hello,this is a paragraph.</p>
</div>]
############################################################
www.baidu.com
www.baidu.com
百度
百度