一、BeautifulSoup安装
pip install beautifulsoup4
二、使用示例
from bs4 import BeautifulSoup

# Sample document shared by every example below.
html_doc = """ <html><head><title>The Dormouse's story</title></head> <body> asdf <div class="title"> <b>The Dormouse's story总共</b> <h1>f</h1> </div> <div class="story">Once upon a time there were three little sisters; and their names were <a class="sister0" id="link1">Els<span>f</span>ie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</div> ad<br/>sf <p class="story">...</p> </body> </html> """

# Parse with the lxml backend (the second positional argument is `features`).
soup = BeautifulSoup(html_doc, "lxml")
1.name标签名称
# Look up the first <a> tag and show it.
first_anchor = soup.find('a')
print(first_anchor)

# Read the tag's name.
print(first_anchor.name)

# Assigning to .name renames the tag in the tree.
first_anchor.name = 'span'
print(soup)
2.attrs标签属性
# Show every attribute of the first <a> tag.
anchor = soup.find('a')
print(anchor.attrs)

# Collect all <a> tags whose id attribute is 'link1'.
matches = soup.find_all('a', attrs={'id': 'link1'})
print(matches)

# Replace the whole attribute dict at once.
anchor.attrs = {'ik': 123}
print(anchor.attrs)

# Add a single attribute, then overwrite it.
for new_id in ('xxxx', 'qqq'):
    anchor.attrs['id'] = new_id
    print(anchor.attrs)
3.find与find_all查找区别
# find returns only the first matching tag.
first_match = soup.find('a')
print(first_match)

# find_all returns every matching tag.
all_matches = soup.find_all('a')
print(all_matches)
4.clear,将标签的所有子标签全部清空(保留标签名)
# Empty out <body>: its contents are removed, the tag itself remains.
soup.find('body').clear()
print(soup)
5.has_attr,检查标签是否具有该属性
# Check whether the first <a> tag carries an 'id' attribute.
anchor = soup.find('a')
print(anchor.has_attr('id'))
6.get_text,获取标签内部文本内容
# Get the text inside the first <a> tag, including text of nested tags.
# The first positional argument of get_text() is a *separator* inserted
# between text fragments, not an attribute name, so it is left out here;
# passing 'id' would splice the literal "id" between the fragments.
tag7 = soup.find('a')
v = tag7.get_text()
print(v)
7.decompose,将当前标签及其所有子标签从文档树中移除并销毁
# Remove <body> and everything inside it from the tree.
soup.find('body').decompose()
print(soup)
8.extract,递归的删除所有的标签,并获取删除的标签
# extract() detaches the tag from the tree and returns it, so unlike
# decompose() the removed subtree can still be used afterwards.
body = soup.find('body')
removed = body.extract()
print(soup)     # the document without <body>
print(removed)  # the detached <body> subtree
9.decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
# Render the inside of <body> as a str, without the <body> tag itself.
# (body.decode() would include the surrounding <body> tag as well.)
body_tag = soup.find('body')
print(body_tag.decode_contents())
10.encode,转换为字节(含当前标签);encode_contents(不含当前标签)
# Render the inside of <body> as bytes, without the <body> tag itself.
# (body.encode() would include the surrounding <body> tag as well.)
body_tag = soup.find('body')
print(body_tag.encode_contents())
11.标签的内容
# Read the text of the first <span>, then replace it.
span = soup.find('span')
print(span.string)
print(span)
span.string = 'new content'
print(span)

# stripped_strings walks the subtree and yields each text fragment;
# it is lazy, so printing it shows the generator object, not the text.
texts = soup.find('body').stripped_strings
print(texts)
print(next(texts))
12.children,所有子标签
# .children iterates over the direct children of <body> only.
# It is a lazy iterator (and yields text nodes as well as tags),
# so it has to be consumed to see anything.
body = soup.find('body')
v = body.children
for child in v:
    print(repr(child))
13.descendants,所有子子孙孙标签
# .descendants iterates over every node below <body>, at any depth:
# children, grandchildren, and so on. It is a lazy generator (and yields
# text nodes as well as tags), so it has to be consumed to see anything.
body = soup.find('body')
v = body.descendants
for node in v:
    print(repr(node))
更多用法可以查阅官方文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html