-
Beautiful Soup初了解
# 解析工具Beautiful Soup,借助网页的结构和属性等特性来解析网页(简单的说就是python的一个HTML或XML的解析库)
# Beautiful Soup支持的解析器(下面每种解析器依次列出:使用方法、优势、劣势)
Python标准库
BeautifulSoup(markup, "html.parser")
Python 的内置标准库、执行速度适中、文档容错能力强
Python 2.7.3 及 Python 3.2.2 之前的版本文档容错能力差
lxml HTML解析器
BeautifulSoup(markup,"lxml")
速度快、文档容错能力强
需要安装c语言库
lxml XML解析器
BeautifulSoup(markup,"xml")
速度快、唯一支持 XML 的解析器
需要安装c语言库
html5lib
BeautifulSoup(markup,"html5lib")
最好的容错性、以浏览器的方式解析文档、生成 HTML5 格式的文档
速度慢、不依赖外部扩展
1 from bs4 import BeautifulSoup 2 3 soup = BeautifulSoup('<p>Hello</p>', 'lxml') 4 print(soup.p.string) 5 6 7 # 输出: 8 Hello
-
Beautiful Soup基本用法
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 <p class="title" name="dromouse"><b>The Dormouse's story</b></p> 7 <p class="story">Once upon a time there were three little sisters; and their names were 8 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, 9 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 and they lived at the bottom of a well.</p> 12 <p class="story">...</p> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 print(soup.prettify(), soup.title.string, sep=' ') 17 # 初始化BeautifulSoup时,自动更正了不标准的HTML 18 # prettify()方法可以把要解析的字符串以标准的缩进格式输出 19 # soup.title 可以选出HTML中的title节点,再调用string属性就可以得到里面的文本了 20 21 22 # 输出: 23 <html> 24 <head> 25 <title> 26 The Dormouse's story 27 </title> 28 </head> 29 <body> 30 <p class="title" name="dromouse"> 31 <b> 32 The Dormouse's story 33 </b> 34 </p> 35 <p class="story"> 36 Once upon a time there were three little sisters; and their names were 37 <a class="sister" href="http://example.com/elsie" id="link1"> 38 <!-- Elsie --> 39 </a> 40 , 41 <a class="sister" href="http://example.com/lacie" id="link2"> 42 Lacie 43 </a> 44 and 45 <a class="sister" href="http://example.com/tillie" id="link3"> 46 Tillie 47 </a> 48 ; 49 and they lived at the bottom of a well. 50 </p> 51 <p class="story"> 52 ... 53 </p> 54 </body> 55 </html> 56 57 The Dormouse's story
-
节点选择器
# 选择元素
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 <p class="title" name="dromouse"><b>The Dormouse's story</b></p> 7 <p class="story">Once upon a time there were three little sisters; and their names were 8 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, 9 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 and they lived at the bottom of a well.</p> 12 <p class="story">...</p> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 17 print(soup.title) # 打印输出title节点的选择结果 18 print(type(soup.title)) # 输出soup.title类型 19 print(soup.title.string) # 输出title节点的内容 20 print(soup.head) # 打印输出head节点的选择结果 21 print(soup.p) # 打印输出p节点的选择结果 22 23 24 # 输出: 25 <title>The Dormouse's story</title> 26 <class 'bs4.element.Tag'> 27 The Dormouse's story 28 <head><title>The Dormouse's story</title></head> 29 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
# 提取信息
# 调用string属性获取文本的值
# 利用name属性获取节点的名称
# 调用attrs获取所有HTML节点属性1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 <p class="title" name="dromouse"><b>The Dormouse's story</b></p> 7 <p class="story">Once upon a time there were three little sisters; and their names were 8 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, 9 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 and they lived at the bottom of a well.</p> 12 <p class="story">...</p> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 17 print(soup.title.name) # 选取title节点,然后调用name属性获得节点名称 18 # 输出:title 19 print(soup.title.string) # 调用string属性,获取title节点的文本值 20 # 输出:The Dormouse's story 21 22 print(soup.p.attrs) # 调用attrs,获取p节点的所有属性 23 # 输出:{'class': ['title'], 'name': 'dromouse'} 24 25 print(soup.p.attrs['name']) # 获取name属性 26 # 输出:dromouse 27 print(soup.p['name']) # 获取name属性 28 # 输出:dromouse
# 嵌套选择
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 """ 7 8 soup = BeautifulSoup(html, 'lxml') 9 print(soup.head.title) 10 print(type(soup.head.title)) 11 print(soup.head.title.string) 12 13 # 输出: 14 <title>The Dormouse's story</title> 15 <class 'bs4.element.Tag'> 16 The Dormouse's story
# 关联选择
# 1、子节点和子孙节点
# contents属性得到的结果是直接子节点的列表。1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <!-- Elsie --> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 # 选取节点元素之后,可以调用contents属性获取它的直接子节点 36 print(soup.p.contents) 37 38 # 输出: 39 [' Once upon a time there were three little sisters; and their names were ', <a class="sister" href="http://example.com/elsie" id="link1"> 40 <!-- Elsie --> 41 </a>, ' , ', <a class="sister" href="http://example.com/lacie" id="link2"> 42 Lacie 43 </a>, ' and ', <a class="sister" href="http://example.com/tillie" id="link3"> 44 Tillie 45 </a>, ' ; and they lived at the bottom of a well. '] 46 # 返回结果是一个列表,列表中的元素是所选节点的直接子节点(不包括孙节点)
# children属性,返回结果是迭代器类型(而不是列表)。与contents属性一样得到直接子节点,只是返回结果类型不同。
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 print(soup.p.children) # 输出:<list_iterator object at 0x1159b7668> 36 for i, child in enumerate(soup.p.children): 37 print(i, child) 38 39 40 # for 循环的输出结果: 41 0 42 Once upon a time there were three little sisters; and their names were 43 44 1 <a class="sister" href="http://example.com/elsie" id="link1"> 45 <span>Elsie</span> 46 </a> 47 2 48 , 49 50 3 <a class="sister" href="http://example.com/lacie" id="link2"> 51 Lacie 52 </a> 53 4 54 and 55 56 5 <a class="sister" href="http://example.com/tillie" id="link3"> 57 Tillie 58 </a> 59 6 60 ; 61 and they lived at the bottom of a well. 62
# descendants属性会递归查询所有子节点,得到所有子孙节点。
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 print(soup.p.descendants) # 输出:<generator object Tag.descendants at 0x1131d0048> 36 for i, child in enumerate(soup.p.descendants): 37 print(i, child) 38 39 40 41 # for 循环输出结果: 42 0 43 Once upon a time there were three little sisters; and their names were 44 45 1 <a class="sister" href="http://example.com/elsie" id="link1"> 46 <span>Elsie</span> 47 </a> 48 2 49 50 3 <span>Elsie</span> 51 4 Elsie 52 5 53 54 6 55 , 56 57 7 <a class="sister" href="http://example.com/lacie" id="link2"> 58 Lacie 59 </a> 60 8 61 Lacie 62 63 9 64 and 65 66 10 <a class="sister" href="http://example.com/tillie" id="link3"> 67 Tillie 68 </a> 69 11 70 Tillie 71 72 12 73 ; 74 and they lived at the bottom of a well. 75
# 2、父节点和祖先节点
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 </p> 17 <p class="story"> 18 ... 19 </p> 20 </body> 21 </html> 22 """ 23 24 soup = BeautifulSoup(html, 'lxml') 25 print(soup.a.parent) 26 27 28 # 输出: 29 <p class="story"> 30 Once upon a time there were three little sisters; and their names were 31 <a class="sister" href="http://example.com/elsie" id="link1"> 32 <span>Elsie</span> 33 </a> 34 </p>
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 </p> 17 <p class="story"> 18 ... 19 </p> 20 </body> 21 </html> 22 """ 23 24 soup = BeautifulSoup(html, 'lxml') 25 print(soup.a.parents, type(soup.a.parents), list(enumerate(soup.a.parents)), sep=' ') 26 27 28 # 输出: 29 <generator object PageElement.parents at 0x11c76e048> 30 31 <class 'generator'> 32 33 [(0, <p class="story"> 34 Once upon a time there were three little sisters; and their names were 35 <a class="sister" href="http://example.com/elsie" id="link1"> 36 <span>Elsie</span> 37 </a> 38 </p>), (1, <body> 39 <p class="story"> 40 Once upon a time there were three little sisters; and their names were 41 <a class="sister" href="http://example.com/elsie" id="link1"> 42 <span>Elsie</span> 43 </a> 44 </p> 45 <p class="story"> 46 ... 47 </p> 48 </body>), (2, <html> 49 <head> 50 <title> 51 The Dormouse's story 52 </title> 53 </head> 54 <body> 55 <p class="story"> 56 Once upon a time there were three little sisters; and their names were 57 <a class="sister" href="http://example.com/elsie" id="link1"> 58 <span>Elsie</span> 59 </a> 60 </p> 61 <p class="story"> 62 ... 63 </p> 64 </body> 65 </html>), (3, <html> 66 <head> 67 <title> 68 The Dormouse's story 69 </title> 70 </head> 71 <body> 72 <p class="story"> 73 Once upon a time there were three little sisters; and their names were 74 <a class="sister" href="http://example.com/elsie" id="link1"> 75 <span>Elsie</span> 76 </a> 77 </p> 78 <p class="story"> 79 ... 80 </p> 81 </body> 82 </html> 83 )]
# 涉及内置函数enumerate()
# enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标,一般用在 for 循环当中。1 # enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标,一般用在 for 循环当中。 2 3 a = ["恕", "我", "直", "言", "在", "坐", "的", "各", "位", "都", "是", "爱", "学", "习", "的"] 4 print(a) # 输出:['恕', '我', '直', '言', '在', '坐', '的', '各', '位', '都', '是', '爱', '学', '习', '的'] 5 b = enumerate(a) 6 print(enumerate(a)) # 输出:<enumerate object at 0x11a1f8b40> 7 print(list(b)) 8 # [(0, '恕'), (1, '我'), (2, '直'), (3, '言'), (4, '在'), (5, '坐'), (6, '的'), (7, '各'), (8, '位'), (9, '都'), 9 # (10, '是'), (11, '爱'), (12, '学'), (13, '习'), (14, '的')] 10 11 for m, n in enumerate(a): 12 print(m, n) 13 # for 循环 输出: 14 0 恕 15 1 我 16 2 直 17 3 言 18 4 在 19 5 坐 20 6 的 21 7 各 22 8 位 23 9 都 24 10 是 25 11 爱 26 12 学 27 13 习 28 14 的
# 3、兄弟节点
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 print( 36 # 获取下一个兄弟元素 37 {'Next Sibling': soup.a.next_sibling}, 38 # 获取上一个兄弟元素 39 {'Previous Sibling': soup.a.previous_sibling}, 40 # 返回后面的兄弟元素 41 {'Next Siblings': list(enumerate(soup.a.next_siblings))}, 42 # 返回前面的兄弟元素 43 {'Previous Siblings': list(enumerate(soup.a.previous_siblings))}, 44 45 sep=' ' 46 ) 47 48 49 # 输出: 50 {'Next Sibling': ' , '} 51 52 {'Previous Sibling': ' Once upon a time there were three little sisters; and their names were '} 53 54 {'Next Siblings': [(0, ' , '), (1, <a class="sister" href="http://example.com/lacie" id="link2"> 55 Lacie 56 </a>), (2, ' and '), (3, <a class="sister" href="http://example.com/tillie" id="link3"> 57 Tillie 58 </a>), (4, ' ; and they lived at the bottom of a well. ')]} 59 60 {'Previous Siblings': [(0, ' Once upon a time there were three little sisters; and their names were ')]}
# 4、提取信息
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <body> 6 <p class="story"> 7 Once upon a time there were three little sisters; and their names were 8 <a class="sister" href="http://example.com/elsie" id="link1">Bob</a> 9 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 10 </p> 11 </body> 12 </html> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 print( 17 'Next Sibling:', 18 19 [soup.a.next_sibling], # 获取下一个兄弟节点 20 # 21 type(soup.a.next_sibling), # 下一个兄弟节点的类型 22 # <class 'bs4.element.NavigableString'> 23 [soup.a.next_sibling.string], # 获取下一个兄弟节点的内容 24 # 25 sep=' ' 26 ) 27 28 print( 29 'Parent:', 30 31 [type(soup.a.parents)], # 所有祖先节点(生成器)的类型 32 # <class 'generator'> 33 [list(soup.a.parents)[0]], # 获取第一个祖先节点 34 # <p class="story"> 35 Once upon a time there were three little sisters; and their names were 36 <a class="sister" href="http://example.com/elsie" id="link1">Bob</a> 37 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 38 </p> 39 [list(soup.a.parents)[0].attrs['class']], # 获取第一个祖先节点的"class属性"的值 40 # ['story'] 41 sep=' ' 42 ) 43 44 # 为了输出返回的结果,均以列表形式 45 46 47 # 输出: 48 Next Sibling: 49 [' '] 50 <class 'bs4.element.NavigableString'> 51 [' '] 52 Parent: 53 [<class 'generator'>] 54 [<p class="story"> 55 Once upon a time there were three little sisters; and their names were 56 <a class="sister" href="http://example.com/elsie" id="link1">Bob</a> 57 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 58 </p>] 59 [['story']]
-
方法选择器
-
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# 查询所有符合条件的元素1 from bs4 import BeautifulSoup 2 3 html = """ 4 <div> 5 <ul> 6 <li class="item-O"><a href="linkl.html">first item</a></li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-inactive"><a href="link3.html">third item</a></li> 9 <li class="item-1"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a> 11 </ul> 12 </div> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 print(soup.find_all(name='li'), 17 type(soup.find_all(name='li')[0]), 18 sep=' ') 19 20 21 # 输出: 22 [<li class="item-O"><a href="linkl.html">first item</a></li>, <li class="item-1"><a href="link2.html">second item</a></li>, <li class="item-inactive"><a href="link3.html">third item</a></li>, <li class="item-1"><a href="link4.html">fourth item</a></li>, <li class="item-0"><a href="link5.html">fifth item</a> 23 </li>] 24 25 <class 'bs4.element.Tag'> 26 27 28 # 返回值是一个列表,列表的元素是名为"li"的节点,每个元素都是bs4.element.Tag类型 29 30 31 # 遍历每个a节点 32 from bs4 import BeautifulSoup 33 34 html = """ 35 <div> 36 <ul> 37 <li class="item-O"><a href="linkl.html">first item</a></li> 38 <li class="item-1"><a href="link2.html">second item</a></li> 39 <li class="item-inactive"><a href="link3.html">third item</a></li> 40 <li class="item-1"><a href="link4.html">fourth item</a></li> 41 <li class="item-0"><a href="link5.html">fifth item</a> 42 </ul> 43 </div> 44 """ 45 46 soup = BeautifulSoup(html, 'lxml') 47 li = soup.find_all(name='li') 48 49 for a in li: 50 print(a.find_all(name='a')) 51 52 # 输出: 53 [<a href="linkl.html">first item</a>] 54 [<a href="link2.html">second item</a>] 55 [<a href="link3.html">third item</a>] 56 [<a href="link4.html">fourth item</a>] 57 [<a href="link5.html">fifth item</a>]
1 from bs4 import BeautifulSoup 2 3 html = """ 4 <div> 5 <ul> 6 <li class="item-O"><a href="linkl.html">first item</a></li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-inactive"><a href="link3.html">third item</a></li> 9 <li class="item-1"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a> 11 </ul> 12 </div> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 17 print(soup.find_all(attrs={'class': 'item-0'})) 18 print(soup.find_all(attrs={'href': 'link5.html'})) 19 20 21 # 输出: 22 [<li class="item-0"><a href="link5.html">fifth item</a> 23 </li>] 24 [<a href="link5.html">fifth item</a>] 25 26 # 可以通过attrs参数传入一些属性来进行查询,即通过特定的属性来查询 27 # find_all(attrs={'属性名': '属性值', ......})
1 from bs4 import BeautifulSoup 2 import re 3 4 html = """ 5 <div class="panel"> 6 <div class="panel-body"> 7 <a>Hello, this is a link</a> 8 <a>Hello, this is a link, too</a> 9 <div/> 10 <div/> 11 """ 12 13 soup = BeautifulSoup(html, 'lxml') 14 15 # 正则表达式规则对象 16 regular = re.compile('link') 17 18 # text参数可用来匹配节点的文本,传入的形式可以是字符串,也可以是正则表达式对象 19 print(soup.find_all(text=regular)) 20 21 # 正则匹配输出 22 print(re.findall(regular, html)) 23 24 25 # 输出: 26 ['Hello, this is a link', 'Hello, this is a link, too'] 27 ['link', 'link']
-
find(name=None, attrs={}, recursive=True, text=None, **kwargs)
仅返回与给定条件匹配的第一个元素
-
-
CSS选择器
- Beautiful Soup 提供了CSS选择器,调用select()方法即可
- css选择器用法:http://www.w3school.com.cn/cssref/css_selectors.asp
-
select(selector, namespaces=None, limit=None, **kwargs)
1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 print( 24 soup.select('.panel .panel-heading'), 25 26 soup.select('ul li'), 27 28 soup.select('#list-2 .element'), 29 30 type(soup.select('ul')[0]), 31 32 sep=' ' 33 ) 34 35 36 # 输出: 37 [<div class="panel-heading"> 38 <h4>Hello</h4> 39 </div>] 40 41 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>] 42 43 [<li class="element">Foo</li>, <li class="element">Bar</li>] 44 45 <class 'bs4.element.Tag'>
1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 ul_all = soup.select('ul') 24 print(ul_all) 25 26 for ul in ul_all: 27 print() 28 print( 29 ul['id'], 30 31 ul.select('li'), 32 33 sep=' ' 34 ) 35 36 37 # 输出: 38 [<ul class="list" id="list-1"> 39 <li class="element">Foo</li> 40 <li class="element">Bar</li> 41 <li class="element">Jay</li> 42 </ul>, <ul class="list list-small" id="list-2"> 43 <li class="element">Foo</li> 44 <li class="element">Bar</li> 45 </ul>] 46 47 list-1 48 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>] 49 50 list-2 51 [<li class="element">Foo</li>, <li class="element">Bar</li>]
1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 ul_all = soup.select('ul') 24 print(ul_all) 25 26 for ul in ul_all: 27 print() 28 print( 29 ul['id'], 30 31 ul.attrs['id'], 32 33 sep=' ' 34 ) 35 36 # 直接传入中括号和属性名 或者 通过attrs属性获取属性值 都可以成功获得属性值 37 38 # 输出: 39 [<ul class="list" id="list-1"> 40 <li class="element">Foo</li> 41 <li class="element">Bar</li> 42 <li class="element">Jay</li> 43 </ul>, <ul class="list list-small" id="list-2"> 44 <li class="element">Foo</li> 45 <li class="element">Bar</li> 46 </ul>] 47 48 list-1 49 list-1 50 51 list-2 52 list-2
1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 ul_all = soup.select('li') 24 print(ul_all) 25 26 for li in ul_all: 27 print() 28 print( 29 'get_text()方法获取文本:'+li.get_text(), 30 31 'string属性获取文本:'+li.string, 32 33 sep=' ' 34 ) 35 36 37 # 输出: 38 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>] 39 40 get_text()方法获取文本:Foo 41 string属性获取文本:Foo 42 43 get_text()方法获取文本:Bar 44 string属性获取文本:Bar 45 46 get_text()方法获取文本:Jay 47 string属性获取文本:Jay 48 49 get_text()方法获取文本:Foo 50 string属性获取文本:Foo 51 52 get_text()方法获取文本:Bar 53 string属性获取文本:Bar