BeautifulSoup模块
BeautifulSoup是一个灵活又方便的网页解析库,处理高效,支持多种解析器;利用它不用编写正则表达式即可方便地实现网页信息的提取。
# Quick start: parse an HTML string and read a value out of it.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<div class="author-name">我的Blog</div>
<div class="info">这个人很懒,什么都没有留下。</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())    # pretty-print the parsed document
print(soup.title.string)  # text inside <title>
1、标签选择器
- 选择元素
# Tag selector: attribute access on the soup returns the first matching tag.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<div class="author-name">我的Blog</div>
<div class="info">这个人很懒,什么都没有留下。</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.title)        # <title>Blog示例</title>
print(type(soup.title))  # <class 'bs4.element.Tag'>
print(soup.head)         # <head><title>Blog示例</title></head>
print(soup.a)
# Expected output of the last print:
# <a class="logo" href="#">
# <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
# </a>
- 获取名称
# Read the name of a tag.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<div class="author-name">我的Blog</div>
<div class="info">这个人很懒,什么都没有留下。</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)  # title
- 获取属性
# Read an attribute of a tag: via the attrs dict or by indexing the tag.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'></p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.p.attrs['name'])  # dromouse
print(soup.p['name'])        # dromouse
- 获取内容
# Read the text content of a tag.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.p.string)  # 人生苦短
- 嵌套
# Nested selection: tag selectors can be chained.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.head.title.string)  # Blog示例
- 子节点和子孙节点
# Child nodes: ``.contents`` gives the direct children of a tag as a list.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
body_nodes = soup.body.contents
print(body_nodes)
# Expected output (the newlines between tags appear as '\n' text nodes):
# ['\n', <a class="logo" href="#">
# <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
# </a>, '\n', <p name="dromouse">人生苦短</p>, '\n']
# Child nodes: ``.children`` yields the same direct children lazily.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
# Printing the attribute shows an iterator object, not the nodes themselves.
print(soup.body.children)
# Iterate to get the actual nodes, numbered from 0.
for i, child in enumerate(soup.body.children):
    print(i, child)
# Descendant nodes: ``.descendants`` walks children, grandchildren and so on.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
# Printing the attribute shows a generator object, not the nodes themselves.
print(soup.body.descendants)
# Iterate to get every nested node, numbered from 0.
for i, node in enumerate(soup.body.descendants):
    print(i, node)
- 父节点和祖先节点
# Parent node: ``.parent`` is the tag that directly encloses this one.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
enclosing_tag = soup.img.parent
print(enclosing_tag)
# Expected output (the <a> tag that wraps the image):
# <a class="logo" href="#">
# <img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
# </a>
# Ancestor nodes: ``.parents`` is iterable; list() collects all ancestors.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
ancestors = list(soup.img.parents)
print(ancestors)
- 兄弟节点
# Sibling nodes: nodes after / before a tag at the same nesting level.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(list(soup.a.next_siblings))      # siblings that follow <a>
print(list(soup.a.previous_siblings))  # siblings that precede <a>
2、标准选择器
find_all(name, attrs, recursive, string, limit, **kwargs)(string参数在bs4 4.4.0之前叫text)
可根据标签名,属性,内容查找文档
- name
# find_all by tag name.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('p'))     # a list of every <p> tag
print(soup.find_all('p')[0])  # index it like any list
# find_all can be called again on each result to search inside it.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
# Search once and reuse the result instead of re-scanning the document.
links = soup.find_all('a')
print(links)        # a list of <a> tags
print(type(links))
for link in links:
    print(link.find_all('img'))  # <img> tags nested inside this <a>
- attrs
# find_all by attribute values, passed as a dict through ``attrs``.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={"class": 'logo'}))
# "name" must go through attrs: find_all's first parameter is itself
# called ``name`` (the tag name), so name='dromouse' would mean <dromouse>.
print(soup.find_all(attrs={"name": 'dromouse'}))
- text(bs4 4.4.0起推荐写作string)
# find_all by text content.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
# ``string`` is the current name of this argument (bs4 >= 4.4.0); the old
# spelling ``text`` is deprecated. A plain str matches the whole text
# exactly, and the result contains the matching strings, not their tags.
print(soup.find_all(string='人生苦短'))
find()只返回第一个匹配的元素(没有匹配时返回None),find_all()返回所有匹配元素组成的列表;find()的参数用法和find_all()一样。
3、CSS选择器
通过select()直接传入CSS选择器即可完成选择
# CSS selectors: select() takes a selector string and returns a list.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.select('.logo'))   # by class
print(soup.select('body p'))  # <p> tags anywhere inside <body>
获取属性
# CSS selectors: read an attribute from a selected tag.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
# select() returns a list, so take the first match before reading attrs.
print(soup.select('body img')[0].attrs['src'])
获取内容
# CSS selectors: read the text of a selected tag.
from bs4 import BeautifulSoup

html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
<a href="#" class="logo">
<img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
</a>
<p name='dromouse'>人生苦短</p>
<p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
# select() returns a list, so take the first match before calling get_text().
print(soup.select('body p')[0].get_text())
总结:
- 推荐使用lxml解析库,必要时使用html.parser
- 标签选择器的筛选功能较弱,但是速度快
- 建议使用find、find_all查询匹配单个结果或者多个结果
- 如果对CSS选择器比较熟建议使用select()
- 最后记住常用的获取属性和文本的方法。
PyQuery解析库
PyQuery是一个强大又灵活的网页解析库。如果你觉得正则表达式写起来太麻烦,觉得BeautifulSoup的语法太难记,又恰好熟悉jQuery的语法,那么PyQuery就是你的绝佳选择。
1、PyQuery初始化
- 初始化字符串
# Initialise PyQuery from an HTML string.
from pyquery import PyQuery as pq

html = '''
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)    # a PyQuery object
print(doc('li'))  # select elements by tag name
- URL初始化
# Initialise PyQuery from a URL (performs a network request).
from pyquery import PyQuery as pq

doc = pq(url='http://www.baidu.com')
print(doc('title'))
- 文件初始化
# Initialise PyQuery from a local file; 'pyquery.html' must exist in the
# current working directory.
from pyquery import PyQuery as pq

doc = pq(filename='pyquery.html')
print(doc('li'))
2、基本CSS选择器
# Basic CSS selector: id, then class, then tag (descendant combinators).
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
print(doc('#container .list li'))
3、查找元素
- 子元素
# Finding child elements with find() and children().
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
items = doc('.list')
# Two ways to reach the <li> elements, kept for comparison; print either
# one to inspect it.
li = items.find('li')     # search below .list with a selector
lis = items.children()    # the children of .list
# children() also accepts a selector to filter the result.
print(items.children('.active'))
- 父元素
# Finding the parent element.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
items = doc('.list')
con = items.parent()  # the element that encloses .list
print(con)
- 兄弟节点
# Finding sibling elements.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
# '.item-0.active' (no space) requires both classes on the same element.
items = doc('.list .item-0.active')
print(items.siblings())
4、遍历
- 单个元素
# Traversal, single element: a selection with one match prints directly.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
items = doc('.item-0.active')
print(items)
# Traversal, multiple elements: items() yields the matches one at a time.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
li = doc('li').items()  # items() is the way to iterate over a selection
print(li)               # <generator object PyQuery.items at 0x...>
print(next(li))         # consumes the first <li>
# The generator resumes where next() left off, so the loop starts at the
# second <li>.
for i in li:
    print(i)
5、获取信息
- 获取属性
# Getting an attribute value: method-call and attribute-style access.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr('href'))  # method-call form
print(a.attr.href)     # attribute form, same value
- 获取文本
# Getting the text content of an element.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())  # text only, with the nested tags stripped
- 获取HTML
# Getting the inner HTML of an element.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.html())  # markup inside the <a>, nested tags included
6、DOM操作
- addClass、removeClass
# DOM manipulation: removing and adding a class.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
li = doc('.item-0.active')
li.removeClass('active')
print(li)  # printed after 'active' was removed
li.addClass('active')
print(li)  # printed after 'active' was added back
- attr 、css
# DOM manipulation: setting an attribute and a CSS property.
from pyquery import PyQuery as pq

html = '''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
li = doc('.item-0.active')
li.attr('name', 'link')  # two arguments: set the attribute
print(li)
li.css('color', 'red')   # set a CSS property on the element
print(li)
- remove
# DOM manipulation: removing a child element before reading the text.
from pyquery import PyQuery as pq

html = '''
<div class='wrap'>
    Hello,world
    <p>This is a paragraph</p>
</div>
'''

doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())        # text before the <p> is removed
wrap.find('p').remove()   # drop the <p> from the tree
print(wrap.text())        # text after the <p> is removed