- 网页解析库
- 提取网页数据
- 第三方库,使用的时候进行安装:
pip3 install beautifulsoup4
解析库
解析器 | 使用方法 |
---|---|
python标准库 | BeautifulSoup(markup,"html.parser") |
lxml HTML解析器 | BeautifulSoup(markup,"lxml") |
lxml XML解析器 | BeautifulSoup(markup,"xml") |
html5lib | BeautifulSoup(markup,"html5lib") |
基本使用
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Basic usage: build a BeautifulSoup tree from the `html` string defined above,
# using lxml as the underlying parser.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# prettify() returns the parsed document re-serialized as an indented string.
print(soup.prettify())
# soup.title is the first <title> tag; .string is the text it contains.
print(soup.title.string)
标签选择器
选择元素
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Tag selectors: accessing a tag name as an attribute of the soup returns the
# first tag with that name in the document.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title)
# Only the first of the several <link> tags is returned.
print(soup.link)
# The whole <head> element, including everything nested inside it.
print(soup.head)
获取名称
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Tag name: .name gives the name of the selected tag as a string.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name) # prints: title
获取属性
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Attribute access: .attrs is a dict of the tag's attributes.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# soup.link is the first <link> tag. `rel` is a multi-valued HTML attribute,
# so Beautiful Soup returns its value as a list (here ['stylesheet']).
print(soup.link.attrs['rel'])
获取内容
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Text content: .string gives the text held directly inside the selected tag.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title.string)
嵌套选择
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Nested selection: each tag-name access returns a Tag, so lookups can be
# chained to walk down the tree (head -> title -> text).
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.head.title.string) # nested selection
# prints: liudemeng - 博客园
子节点和子孙节点
html = """<div class="postCon"><div class="c_b_p_desc">摘要: 数据提取方法 json 数据交换格式,看起来像python中的(字典)的字符串 使用之前进行导包处理 import json 哪里会有json的数据 浏览器切换到手机版 抓包app json.loads 把json字符串转化为python类型 json.loads(json字符串) json.dum<a href="https://www.cnblogs.com/liudemeng/p/10715075.html" class="c_b_p_desc_readmore">阅读全文</a></div></div>"""
# Child nodes: .contents is a list of the tag's direct children.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# soup.div is the outer <div class="postCon">; its only direct child is the
# inner <div class="c_b_p_desc">, so the list has a single element.
print(soup.div.contents)
html = """<div class="postCon"><div class="c_b_p_desc">摘要: 数据提取方法 json 数据交换格式,看起来像python中的(字典)的字符串 使用之前进行导包处理 import json 哪里会有json的数据 浏览器切换到手机版 抓包app json.loads 把json字符串转化为python类型 json.loads(json字符串) json.dum<a href="https://www.cnblogs.com/liudemeng/p/10715075.html" class="c_b_p_desc_readmore">阅读全文</a></div></div>"""
# Child nodes via .children: same direct children as .contents, but exposed
# as an iterator instead of a list.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Printing the iterator itself only shows the iterator object, not the nodes.
print(soup.div.children)
# Walk the iterator to print each direct child together with its index.
for i, child in enumerate(soup.div.children):
    print(i, child)
父节点和祖先节点
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Parent node: .parent is the element that directly encloses the tag.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# soup.meta is the first <meta> tag, so its parent is the <head> element.
print(soup.meta.parent)
兄弟节点
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# Sibling nodes: .next_siblings / .previous_siblings yield the nodes that
# follow / precede the tag under the same parent.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Both attributes are generators; materialize them with list() so the
# siblings themselves are printed rather than a bare generator object.
# Whitespace between tags appears in the result as text nodes.
print(list(soup.meta.next_siblings))
print(list(soup.meta.previous_siblings))
标准选择器
find_all(name, attrs, recursive, text, **kwargs)
可根据标签名、属性、内容查找文档(注:自 Beautiful Soup 4.4.0 起,text 参数更名为 string,旧名称仍可使用但已不推荐)
name
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# find_all by tag name: returns a list-like result with every matching tag.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('link'))
# The result supports indexing; [0] is the first <link> in document order.
print(soup.find_all('link')[0])
attrs
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link class='element' type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link id="list-1" title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# find_all by attribute: filter on attribute values instead of the tag name.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Form 1: pass a dict of attribute name -> value through `attrs`.
print(soup.find_all(attrs={'id':"list-1"}))
# Form 2: pass the attribute as a keyword argument. `class` is a reserved word
# in Python, so the keyword is spelled `class_`.
print(soup.find_all(class_='element'))
# Both forms above are valid ways to filter by attribute.
text
html ="""
<html lang="zh-cn">
<head>
<meta charset="utf-8" />
<meta content="width=device-width, initial-scale=1" />
<meta name="referrer" content="origin" />
<title>liudemeng - 博客园</title>
<link class='element' type="text/css" rel="stylesheet" href="/bundles/blog-common.css?v=svlZeZMvc3He7PqOD4T7SOuQn0_kIfLBYiy3idqd35Y1"/>
<link id="MainCss" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden.css?v=R6EW1cwbYc7SqZ5y0CMKPNjYaFnIdEGDIwRo4NL-lHw1"/>
<link id="mobile-style" media="only screen and (max- 767px)" type="text/css" rel="stylesheet" href="/skins/summerGarden/bundle-summerGarden-mobile.css?v=0rD5dztz_pRczoCFf4jzWyb4-oTf_yCZ8ttZxIagC2s1"/>
<link title="RSS" type="application/rss+xml" rel="alternate" href="https://www.cnblogs.com/liudemeng/rss"/>
<link id="list-1" title="RSD" type="application/rsd+xml" rel="EditURI" href="https://www.cnblogs.com/liudemeng/rsd.xml"/>
<link type="application/wlwmanifest+xml" rel="wlwmanifest" href="https://www.cnblogs.com/liudemeng/wlwmanifest.xml"/>
"""
# find_all by text: match on the text content of nodes instead of on tags.
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# `string=` is the current name of the filter that used to be called `text=`
# (renamed in Beautiful Soup 4.4.0; the old spelling is deprecated).
# The result contains the matching strings themselves, not their tags.
print(soup.find_all(string="liudemeng - 博客园"))
# prints: ['liudemeng - 博客园']
find(name, attrs, recursive,text,**kwargs)
find 返回第一个匹配的元素(没有匹配时返回 None),find_all 返回所有匹配元素组成的列表
find_parents() 和 find_parent()
find_parents() 返回所有的祖先节点, find_parent() 返回直接父节点
总结:
- 建议使用lxml解析库, 必要时使用html.parser
- 标签选择器筛选功能弱但是速度快
- 建议使用find(), find_all() 查询匹配单个结果或者多个结果
- 记住常用的获取属性以及文本值的方法