BS基础 - 走看看

zoukankan html css js c++ java

BS基础

BeautifulSoup 库详解
# -*- coding:utf8 -*-
# Project path: 3.3 beautifulsoup库.py
# Project date: 9/6/2019
# Project goal: a detailed walkthrough of how to use beautifulsoup
"""
bs supports lxml, HTML parsing and html5 parsing.

"""
#%% Parse a sample document, pretty-print it and read the title text
from bs4 import BeautifulSoup

# Sample markup that gets parsed below.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
The Dormouse's story
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
...
"""

soup = BeautifulSoup(html, features='lxml')

# Formatted rendering of the parsed HTML.
pretty_markup = soup.prettify()
print(pretty_markup)

# Text content of the <title> tag.
title_tag = soup.title
print(title_tag.string)

#%% Tag selectors
from bs4 import BeautifulSoup

# The sample keeps its <p> elements: the soup.p lookup at the end of this
# cell needs at least one <p> in the document to have something to return.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.title)
print(type(soup.title))  # a bs4 element of type Tag
print(soup.head)
print(type(soup.head))
print(soup.p)  # only the first matching <p> tag is returned

#%% Getting a tag's name
from bs4 import BeautifulSoup

# The sample keeps its <p> elements: soup.p.name below needs soup.p to
# resolve to an actual tag.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)  # name of the <title> tag
print(soup.p.name)  # name of the first <p> tag

#%% Getting a tag's attributes
from bs4 import BeautifulSoup

# The first <p> carries a name attribute on purpose: both lookups below read
# the 'name' attribute of soup.p and fail if it is absent.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.p['name'])  # attribute access by subscripting the tag
print(soup.p.attrs['name'])  # same attribute, read through the attrs mapping

#%% Getting the text inside a tag with .string
from bs4 import BeautifulSoup

# The sample keeps its <p> elements: soup.p.string below needs soup.p to
# resolve to an actual tag.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.p.string)  # text content of the first <p> tag

#%% Nested tag selection
from bs4 import BeautifulSoup

# The sample keeps its <p> elements so that soup.body.p below has a tag to
# select.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
# Tag attributes can be chained to walk down the tree one level at a time.
print(soup.head.title.string)
print(soup.body.p)
# Attributes of the first <a> found under <body>.
print(soup.body.a['href'])
print(soup.body.a['class'])
print(soup.body.a['id'])

#%% Selecting child and descendant nodes
from bs4 import BeautifulSoup

# The story text and its links sit inside a <p> element: every lookup in this
# cell goes through soup.p / soup.body.p.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
Elsie
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.body.p.a['href'])
# .contents holds the direct children of the tag; print its type, then walk
# it one child at a time.
print(type(soup.p.contents))
for i in soup.p.contents:
    print(i)

#%% .children yields the child nodes as an iterator,
# so the items have to be pulled out with a loop
from bs4 import BeautifulSoup

# The story text and its links sit inside a <p> element: soup.p.children
# below needs soup.p to resolve to an actual tag.
html = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
Elsie
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.p.children)  # prints the iterator object itself, not its items
for i, child in enumerate(soup.p.children):
    print(i, child)

查看全文

相关阅读:
[NOIP模拟赛][贪心]奶牛晒衣服.
BZOJ3750: [POI2015]Piecz
BZOJ2348 [Baltic 2011]Plagiarism
高精度乘法【高乘高
 codevs 1215 迷宫
 变量交换
 a+b问题与圆柱体表面积的计算
 算数表达式的练习
 [bzoj1070][SCOI2007]修车[ 网络流]
[bzoj2502]清理雪道[上下界网络流]

原文地址：https://www.cnblogs.com/binyang/p/10995677.html