-
Beautiful Soup初了解
# 解析工具Beautiful Soup,借助网页的结构和属性等特性来解析网页(简单地说就是Python的一个HTML或XML解析库)
# Beautiful Soup支持的解析器
实例引入:解析器 使用方法 优势 劣势
Python标准库
BeautifulSoup(markup, "html.parser")
Python 的内置标准库、执行速度适中、文档容错能力强
Python 2.7.3及 Python3.2.2 之前的版本文档容错能力差
lxml HTML解析器
BeautifulSoup(markup,"lxml")
速度快、文档容错能力强
需要安装c语言库
lxml XML解析器
BeautifulSoup(markup,"xml")
速度快、唯一支持 XML 的解析器
需要安装c语言库
html5lib
BeautifulSoup(markup,"html5lib")
最好的容错性、以浏览器的方式解析文档、生成 HTML5 格式的文档
速度慢、不依赖外部扩展
View Code1 from bs4 import BeautifulSoup 2 3 soup = BeautifulSoup('<p>Hello</p>', 'lxml') 4 print(soup.p.string) 5 6 7 # 输出: 8 Hello
-
Beautiful Soup基本用法
View Code1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 <p class="title" name="dromouse"><b>The Dormouse's story</b></p> 7 <p class="story">Once upon a time there were three little sisters; and their names were 8 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, 9 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 and they lived at the bottom of a well.</p> 12 <p class="story">...</p> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 print(soup.prettify(), soup.title.string, sep=' ') 17 # 初始化BeautifulSoup时,自动更正了不标准的HTML 18 # prettify()方法可以把要解析的字符串以标准的缩进格式输出 19 # soup.title 可以选出HTML中的title节点,再调用string属性就可以得到里面的文本了 20 21 22 # 输出: 23 <html> 24 <head> 25 <title> 26 The Dormouse's story 27 </title> 28 </head> 29 <body> 30 <p class="title" name="dromouse"> 31 <b> 32 The Dormouse's story 33 </b> 34 </p> 35 <p class="story"> 36 Once upon a time there were three little sisters; and their names were 37 <a class="sister" href="http://example.com/elsie" id="link1"> 38 <!-- Elsie --> 39 </a> 40 , 41 <a class="sister" href="http://example.com/lacie" id="link2"> 42 Lacie 43 </a> 44 and 45 <a class="sister" href="http://example.com/tillie" id="link3"> 46 Tillie 47 </a> 48 ; 49 and they lived at the bottom of a well. 50 </p> 51 <p class="story"> 52 ... 53 </p> 54 </body> 55 </html> 56 57 The Dormouse's story
-
节点选择器
# 选择元素
View Code1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 <p class="title" name="dromouse"><b>The Dormouse's story</b></p> 7 <p class="story">Once upon a time there were three little sisters; and their names were 8 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, 9 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 and they lived at the bottom of a well.</p> 12 <p class="story">...</p> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 17 print(soup.title) # 打印输出title节点的选择结果 18 print(type(soup.title)) # 输出soup.title类型 19 print(soup.title.string) # 输出title节点的内容 20 print(soup.head) # 打印输出head节点的选择结果 21 print(soup.p) # 打印输出p节点的选择结果 22 23 24 # 输出: 25 <title>The Dormouse's story</title> 26 <class 'bs4.element.Tag'> 27 The Dormouse's story 28 <head><title>The Dormouse's story</title></head> 29 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>
# 提取信息
# 调用string属性获取文本的值
# 利用name属性获取节点的名称
# 调用attrs获取所有HTML节点属性
View Code1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 <p class="title" name="dromouse"><b>The Dormouse's story</b></p> 7 <p class="story">Once upon a time there were three little sisters; and their names were 8 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>, 9 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 and they lived at the bottom of a well.</p> 12 <p class="story">...</p> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 17 print(soup.title.name) # 选取title节点,然后调用name属性获得节点名称 18 # 输出:title 19 print(soup.title.string) # 调用string属性,获取title节点的文本值 20 # 输出:The Dormouse's story 21 22 print(soup.p.attrs) # 调用attrs,获取p节点的所有属性 23 # 输出:{'class': ['title'], 'name': 'dromouse'} 24 25 print(soup.p.attrs['name']) # 获取name属性 26 # 输出:dromouse 27 print(soup.p['name']) # 获取name属性 28 # 输出:dromouse
# 嵌套选择
View Code1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html><head><title>The Dormouse's story</title></head> 5 <body> 6 """ 7 8 soup = BeautifulSoup(html, 'lxml') 9 print(soup.head.title) 10 print(type(soup.head.title)) 11 print(soup.head.title.string) 12 13 # 输出: 14 <title>The Dormouse's story</title> 15 <class 'bs4.element.Tag'> 16 The Dormouse's story
# 关联选择
# 1、子节点和子孙节点
# contents属性得到的结果是直接子节点的列表。
直接子节点1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <!-- Elsie --> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 # 选取节点元素之后,可以调用contents属性获取它的直接子节点 36 print(soup.p.contents) 37 38 # 输出: 39 [' Once upon a time there were three little sisters; and their names were ', <a class="sister" href="http://example.com/elsie" id="link1"> 40 <!-- Elsie --> 41 </a>, ' , ', <a class="sister" href="http://example.com/lacie" id="link2"> 42 Lacie 43 </a>, ' and ', <a class="sister" href="http://example.com/tillie" id="link3"> 44 Tillie 45 </a>, ' ; and they lived at the bottom of a well. '] 46 # 返回结果是一个列表,列表中的元素是所选节点的直接子节点(不包括孙节点)
# children属性,返回结果是迭代器类型(下例中输出为list_iterator)。与contents属性得到的直接子节点相同,只是返回结果类型不同。
直接子节点1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 print(soup.p.children) # 输出:<list_iterator object at 0x1159b7668> 36 for i, child in enumerate(soup.p.children): 37 print(i, child) 38 39 40 # for 循环的输出结果: 41 0 42 Once upon a time there were three little sisters; and their names were 43 44 1 <a class="sister" href="http://example.com/elsie" id="link1"> 45 <span>Elsie</span> 46 </a> 47 2 48 , 49 50 3 <a class="sister" href="http://example.com/lacie" id="link2"> 51 Lacie 52 </a> 53 4 54 and 55 56 5 <a class="sister" href="http://example.com/tillie" id="link3"> 57 Tillie 58 </a> 59 6 60 ; 61 and they lived at the bottom of a well. 62
# descendants属性会递归查询所有子节点,得到所有子孙节点。
获取子孙节点1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 print(soup.p.descendants) # 输出:<generator object Tag.descendants at 0x1131d0048> 36 for i, child in enumerate(soup.p.descendants): 37 print(i, child) 38 39 40 41 # for 循环输出结果: 42 0 43 Once upon a time there were three little sisters; and their names were 44 45 1 <a class="sister" href="http://example.com/elsie" id="link1"> 46 <span>Elsie</span> 47 </a> 48 2 49 50 3 <span>Elsie</span> 51 4 Elsie 52 5 53 54 6 55 , 56 57 7 <a class="sister" href="http://example.com/lacie" id="link2"> 58 Lacie 59 </a> 60 8 61 Lacie 62 63 9 64 and 65 66 10 <a class="sister" href="http://example.com/tillie" id="link3"> 67 Tillie 68 </a> 69 11 70 Tillie 71 72 12 73 ; 74 and they lived at the bottom of a well. 75
# 2、父节点和祖先节点
parent获取某个节点的一个父节点1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 </p> 17 <p class="story"> 18 ... 19 </p> 20 </body> 21 </html> 22 """ 23 24 soup = BeautifulSoup(html, 'lxml') 25 print(soup.a.parent) 26 27 28 # 输出: 29 <p class="story"> 30 Once upon a time there were three little sisters; and their names were 31 <a class="sister" href="http://example.com/elsie" id="link1"> 32 <span>Elsie</span> 33 </a> 34 </p>
parent获取所有祖先节点1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 </p> 17 <p class="story"> 18 ... 19 </p> 20 </body> 21 </html> 22 """ 23 24 soup = BeautifulSoup(html, 'lxml') 25 print(soup.a.parents, type(soup.a.parents), list(enumerate(soup.a.parents)), sep=' ') 26 27 28 # 输出: 29 <generator object PageElement.parents at 0x11c76e048> 30 31 <class 'generator'> 32 33 [(0, <p class="story"> 34 Once upon a time there were three little sisters; and their names were 35 <a class="sister" href="http://example.com/elsie" id="link1"> 36 <span>Elsie</span> 37 </a> 38 </p>), (1, <body> 39 <p class="story"> 40 Once upon a time there were three little sisters; and their names were 41 <a class="sister" href="http://example.com/elsie" id="link1"> 42 <span>Elsie</span> 43 </a> 44 </p> 45 <p class="story"> 46 ... 47 </p> 48 </body>), (2, <html> 49 <head> 50 <title> 51 The Dormouse's story 52 </title> 53 </head> 54 <body> 55 <p class="story"> 56 Once upon a time there were three little sisters; and their names were 57 <a class="sister" href="http://example.com/elsie" id="link1"> 58 <span>Elsie</span> 59 </a> 60 </p> 61 <p class="story"> 62 ... 63 </p> 64 </body> 65 </html>), (3, <html> 66 <head> 67 <title> 68 The Dormouse's story 69 </title> 70 </head> 71 <body> 72 <p class="story"> 73 Once upon a time there were three little sisters; and their names were 74 <a class="sister" href="http://example.com/elsie" id="link1"> 75 <span>Elsie</span> 76 </a> 77 </p> 78 <p class="story"> 79 ... 80 </p> 81 </body> 82 </html> 83 )]
# 涉及内置函数enumerate()
# enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标,一般用在 for 循环当中。
enumerate()内置函数1 # enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标,一般用在 for 循环当中。 2 3 a = ["恕", "我", "直", "言", "在", "坐", "的", "各", "位", "都", "是", "爱", "学", "习", "的"] 4 print(a) # 输出:['恕', '我', '直', '言', '在', '坐', '的', '各', '位', '都', '是', '爱', '学', '习', '的'] 5 b = enumerate(a) 6 print(enumerate(a)) # 输出:<enumerate object at 0x11a1f8b40> 7 print(list(b)) 8 # [(0, '恕'), (1, '我'), (2, '直'), (3, '言'), (4, '在'), (5, '坐'), (6, '的'), (7, '各'), (8, '位'), (9, '都'), 9 # (10, '是'), (11, '爱'), (12, '学'), (13, '习'), (14, '的')] 10 11 for m, n in enumerate(a): 12 print(m, n) 13 # for 循环 输出: 14 0 恕 15 1 我 16 2 直 17 3 言 18 4 在 19 5 坐 20 6 的 21 7 各 22 8 位 23 9 都 24 10 是 25 11 爱 26 12 学 27 13 习 28 14 的
# 3、兄弟节点
获取同级节点1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <head> 6 <title> 7 The Dormouse's story 8 </title> 9 </head> 10 <body> 11 <p class="story"> 12 Once upon a time there were three little sisters; and their names were 13 <a class="sister" href="http://example.com/elsie" id="link1"> 14 <span>Elsie</span> 15 </a> 16 , 17 <a class="sister" href="http://example.com/lacie" id="link2"> 18 Lacie 19 </a> 20 and 21 <a class="sister" href="http://example.com/tillie" id="link3"> 22 Tillie 23 </a> 24 ; 25 and they lived at the bottom of a well. 26 </p> 27 <p class="story"> 28 ... 29 </p> 30 </body> 31 </html> 32 """ 33 34 soup = BeautifulSoup(html, 'lxml') 35 print( 36 # 获取下一个兄弟元素 37 {'Next Sibling': soup.a.next_sibling}, 38 # 获取上一个兄弟元素 39 {'Previous Sibling': soup.a.previous_sibling}, 40 # 返回后面的兄弟元素 41 {'Next Siblings': list(enumerate(soup.a.next_siblings))}, 42 # 返回前面的兄弟元素 43 {'Previous Siblings': list(enumerate(soup.a.previous_siblings))}, 44 45 sep=' ' 46 ) 47 48 49 # 输出: 50 {'Next Sibling': ' , '} 51 52 {'Previous Sibling': ' Once upon a time there were three little sisters; and their names were '} 53 54 {'Next Siblings': [(0, ' , '), (1, <a class="sister" href="http://example.com/lacie" id="link2"> 55 Lacie 56 </a>), (2, ' and '), (3, <a class="sister" href="http://example.com/tillie" id="link3"> 57 Tillie 58 </a>), (4, ' ; and they lived at the bottom of a well. ')]} 59 60 {'Previous Siblings': [(0, ' Once upon a time there were three little sisters; and their names were ')]}
# 4、提取信息
View Code1 from bs4 import BeautifulSoup 2 3 html = """ 4 <html> 5 <body> 6 <p class="story"> 7 Once upon a time there were three little sisters; and their names were 8 <a class="sister" href="http://example.com/elsie" id="link1">Bob</a> 9 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 10 </p> 11 </body> 12 </html> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 print( 17 'Next Sibling:', 18 19 [soup.a.next_sibling], # 获取上一个兄弟节点 20 # 21 type(soup.a.next_sibling), # 上一个兄弟节点的类型 22 # <class 'bs4.element.NavigableString'> 23 [soup.a.next_sibling.string], # 获取上一个兄弟节点的内容 24 # 25 sep=' ' 26 ) 27 28 print( 29 'Parent:', 30 31 [type(soup.a.parents)], # 获取所有的祖先节点 32 # <class 'generator'> 33 [list(soup.a.parents)[0]], # 获取第一个祖先节点 34 # <p class="story"> 35 Once upon a time there were three little sisters; and their names were 36 <a class="sister" href="http://example.com/elsie" id="link1">Bob</a> 37 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 38 </p> 39 [list(soup.a.parents)[0].attrs['class']], # 获取第一个祖先节点的"class属性"的值 40 # ['story'] 41 sep=' ' 42 ) 43 44 # 为了输出返回的结果,均以列表形式 45 46 47 # 输出: 48 Next Sibling: 49 [' '] 50 <class 'bs4.element.NavigableString'> 51 [' '] 52 Parent: 53 [<class 'generator'>] 54 [<p class="story"> 55 Once upon a time there were three little sisters; and their names were 56 <a class="sister" href="http://example.com/elsie" id="link1">Bob</a> 57 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 58 </p>] 59 [['story']]
-
方法选择器
-
find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# 查询所有符合条件的元素
name参数1 from bs4 import BeautifulSoup 2 3 html = """ 4 <div> 5 <ul> 6 <li class="item-O"><a href="linkl.html">first item</a></li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-inactive"><a href="link3.html">third item</a></li> 9 <li class="item-1"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a> 11 </ul> 12 </div> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 print(soup.find_all(name='li'), 17 type(soup.find_all(name='li')[0]), 18 sep=' ') 19 20 21 # 输出: 22 [<li class="item-O"><a href="linkl.html">first item</a></li>, <li class="item-1"><a href="link2.html">second item</a></li>, <li class="item-inactive"><a href="link3.html">third item</a></li>, <li class="item-1"><a href="link4.html">fourth item</a></li>, <li class="item-0"><a href="link5.html">fifth item</a> 23 </li>] 24 25 <class 'bs4.element.Tag'> 26 27 28 # 返回值是一个列表,列表的元素是名为"li"的节点,每个元素都是bs4.element.Tag类型 29 30 31 # 遍历每个a节点 32 from bs4 import BeautifulSoup 33 34 html = """ 35 <div> 36 <ul> 37 <li class="item-O"><a href="linkl.html">first item</a></li> 38 <li class="item-1"><a href="link2.html">second item</a></li> 39 <li class="item-inactive"><a href="link3.html">third item</a></li> 40 <li class="item-1"><a href="link4.html">fourth item</a></li> 41 <li class="item-0"><a href="link5.html">fifth item</a> 42 </ul> 43 </div> 44 """ 45 46 soup = BeautifulSoup(html, 'lxml') 47 li = soup.find_all(name='li') 48 49 for a in li: 50 print(a.find_all(name='a')) 51 52 # 输出: 53 [<a href="linkl.html">first item</a>] 54 [<a href="link2.html">second item</a>] 55 [<a href="link3.html">third item</a>] 56 [<a href="link4.html">fourth item</a>] 57 [<a href="link5.html">fifth item</a>]
attrs参数1 from bs4 import BeautifulSoup 2 3 html = """ 4 <div> 5 <ul> 6 <li class="item-O"><a href="linkl.html">first item</a></li> 7 <li class="item-1"><a href="link2.html">second item</a></li> 8 <li class="item-inactive"><a href="link3.html">third item</a></li> 9 <li class="item-1"><a href="link4.html">fourth item</a></li> 10 <li class="item-0"><a href="link5.html">fifth item</a> 11 </ul> 12 </div> 13 """ 14 15 soup = BeautifulSoup(html, 'lxml') 16 17 print(soup.find_all(attrs={'class': 'item-0'})) 18 print(soup.find_all(attrs={'href': 'link5.html'})) 19 20 21 # 输出: 22 [<li class="item-0"><a href="link5.html">fifth item</a> 23 </li>] 24 [<a href="link5.html">fifth item</a>] 25 26 # 可以通过attrs参数传入一些属性来进行查询,即通过特定的属性来查询 27 # find_all(attrs={'属性名': '属性值', ......})
text参数1 from bs4 import BeautifulSoup 2 import re 3 4 html = """ 5 <div class="panel"> 6 <div class="panel-body"> 7 <a>Hello, this is a link</a> 8 <a>Hello, this is a link, too</a> 9 <div/> 10 <div/> 11 """ 12 13 soup = BeautifulSoup(html, 'lxml') 14 15 # 正则表达式规则对象 16 regular = re.compile('link') 17 18 # text参数课用来匹配节点的文本,传入的形式可以是字符串,也可以是正则表达式对象 19 print(soup.find_all(text=regular)) 20 21 # 正则匹配输出 22 print(re.findall(regular, html)) 23 24 25 # 输出: 26 ['Hello, this is a link', 'Hello, this is a link, too'] 27 ['link', 'link']
-
find(name=None, attrs={}, recursive=True, text=None, **kwargs)
仅返回与给定条件匹配的第一个元素
-
-
CSS选择器
- Beautiful Soup 提供了CSS选择器,调用select()方法即可
- css选择器用法:http://www.w3school.com.cn/cssref/css_selectors.asp
-
select(selector, namespaces=None, limit=None, **kwargs)
简单示例1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 print( 24 soup.select('.panel .panel-heading'), 25 26 soup.select('ul li'), 27 28 soup.select('#list-2 .element'), 29 30 type(soup.select('ul')[0]), 31 32 sep=' ' 33 ) 34 35 36 # 输出: 37 [<div class="panel-heading"> 38 <h4>Hello</h4> 39 </div>] 40 41 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>] 42 43 [<li class="element">Foo</li>, <li class="element">Bar</li>] 44 45 <class 'bs4.element.Tag'>
嵌套选择1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 ul_all = soup.select('ul') 24 print(ul_all) 25 26 for ul in ul_all: 27 print() 28 print( 29 ul['id'], 30 31 ul.select('li'), 32 33 sep=' ' 34 ) 35 36 37 # 输出: 38 [<ul class="list" id="list-1"> 39 <li class="element">Foo</li> 40 <li class="element">Bar</li> 41 <li class="element">Jay</li> 42 </ul>, <ul class="list list-small" id="list-2"> 43 <li class="element">Foo</li> 44 <li class="element">Bar</li> 45 </ul>] 46 47 list-1 48 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>] 49 50 list-2 51 [<li class="element">Foo</li>, <li class="element">Bar</li>]
获取属性1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 ul_all = soup.select('ul') 24 print(ul_all) 25 26 for ul in ul_all: 27 print() 28 print( 29 ul['id'], 30 31 ul.attrs['id'], 32 33 sep=' ' 34 ) 35 36 # 直接传入中括号和属性名 或者 通过attrs属性获取属性值 都可以成功获得属性值 37 38 # 输出: 39 [<ul class="list" id="list-1"> 40 <li class="element">Foo</li> 41 <li class="element">Bar</li> 42 <li class="element">Jay</li> 43 </ul>, <ul class="list list-small" id="list-2"> 44 <li class="element">Foo</li> 45 <li class="element">Bar</li> 46 </ul>] 47 48 list-1 49 list-1 50 51 list-2 52 list-2
获取文本1 html = ''' 2 <div class="panel"> 3 <div class="panel-heading"> 4 <h4>Hello</h4> 5 </div> 6 <div class="panel-body"> 7 <ul class="list" id="list-1"> 8 <li class="element">Foo</li> 9 <li class="element">Bar</li> 10 <li class="element">Jay</li> 11 </ul> 12 <ul class="list list-small" id="list-2"> 13 <li class="element">Foo</li> 14 <li class="element">Bar</li> 15 </ul> 16 </div> 17 </div> 18 ''' 19 20 from bs4 import BeautifulSoup 21 soup = BeautifulSoup(html, 'lxml') 22 23 ul_all = soup.select('li') 24 print(ul_all) 25 26 for li in ul_all: 27 print() 28 print( 29 'get_text()方法获取文本:'+li.get_text(), 30 31 'string属性获取文本:'+li.string, 32 33 sep=' ' 34 ) 35 36 37 # 输出: 38 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>] 39 40 get_text()方法获取文本:Foo 41 string属性获取文本:Foo 42 43 get_text()方法获取文本:Bar 44 string属性获取文本:Bar 45 46 get_text()方法获取文本:Jay 47 string属性获取文本:Jay 48 49 get_text()方法获取文本:Foo 50 string属性获取文本:Foo 51 52 get_text()方法获取文本:Bar 53 string属性获取文本:Bar