1.信息的三种类型
1.1 XML
1.2 JSON
1.3 YAML
三种信息的比较:
2.信息的提取
import requests
from bs4 import BeautifulSoup

# Download the demo page and parse it with the built-in "html.parser" backend.
# The timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops early on an HTTP error instead of parsing an
# error page.
r = requests.get("http://python123.io/ws/demo.html", timeout=30)
r.raise_for_status()
demo = r.text
soup = BeautifulSoup(demo, "html.parser")

# Tag
# print(soup.find_all(tag))  # NO: no name `tag` is defined at this point
# find_all() called without any filter matches every tag in the document.
# NOTE(review): the loop body was missing from the original notes; printing
# the tag name is a reconstruction -- confirm the intended body.
for tag in soup.find_all():
    print(tag.name)
string
# string
# Searching by text content. `soup` is the BeautifulSoup object built from the
# demo page earlier in these notes.
import re

# print(soup.find_all("a").string)  # wrong: find_all() returns a collection of tags, not a single tag
print(soup.find_all("a", str))  # wrong: the second positional argument is `attrs`, not a text filter
print(soup.find_all(str="Basic Python"))  # wrong: an unrecognized keyword such as `str` is treated as an attribute filter
print(soup.find_all(string="Basic Python"))  # correct: exact match on the text
# ['Basic Python']

print(soup.find_all(string=re.compile("python")))  # regular expression: fuzzy (partial) match
# ['This is a python demo page', 'The demo python introduces several python courses.']
others
# Searching by tag attribute values. `soup` is the BeautifulSoup object built
# from the demo page earlier in these notes.
import re

print(soup.find_all("p", "course"))  # a string as the second argument filters on the CSS class
# print(soup.find_all(id="link1"))
print(soup.find_all(id="link"))  # [] -- a plain string only matches the exact value
print(soup.find_all(id=re.compile("link")))  # use a regular expression for a partial match

# Whether to search all descendants; the default is yes.
print(soup.find_all("a"))
print(soup.find_all("a", recursive=False))  # do not search descendants, only direct children
# []