from bs4 import BeautifulSoup as BS
# First, initialize a BeautifulSoup object from the markup to parse, e.g.:
#     soup = BS(text, 'lxml')
# (`text` stands for the HTML string; it is not defined in this snippet.)
示例:
from bs4 import BeautifulSoup

# Sample document used by the examples below.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Create the BeautifulSoup object, parsing the document with the lxml parser.
soup = BeautifulSoup(html_doc, 'lxml')

# find() returns the first matching tag -- here the first <p> element.
# NOTE: the name `find` is kept because later snippets in this file use it.
find = soup.find('p')

# Type of the value returned by find().
# Prints: find's return type is  <class 'bs4.element.Tag'>
print("find's return type is ", type(find))

# The matched tag itself.
# Prints: find's content is <p class="title"><b>The Dormouse's story</b></p>
print("find's content is", find)

# The tag's name.
# Prints: find's Tag Name is  p
print("find's Tag Name is ", find.name)

# The tag's class attribute; the tutorial's stated value is a list.
# Prints: find's Attribute(class) is  ['title']
print("find's Attribute(class) is ", find['class'])
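输出内容:
find's return type is  <class 'bs4.element.Tag'>
find's content is <p class="title"><b>The Dormouse's story</b></p>
find's Tag Name is  p
find's Attribute(class) is  ['title']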
NavigableString就是标签中的文本内容(不包含标签)。获取方式如下:
# `find` is the Tag obtained earlier in this file via soup.find('p').
# Its .string attribute gives the text inside the tag, without any markup.
# NOTE(review): per the Beautiful Soup docs, .string is None when a tag has
# more than one child -- confirm if reusing this on other tags.
print('NavigableString is:', find.string)