1.利用requests.get(url)获取网页页面的html文件
import requests
import bs4
# Fetch the page at `url`, then look up elements by their id attribute.
# The scheme separator is a single colon ("http://") and the literal is closed.
url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
response = requests.get(url)
status_code = response.status_code
# Decode the raw response bytes explicitly as UTF-8 before parsing with lxml.
content = bs4.BeautifulSoup(response.content.decode("utf-8"), "lxml")
# find_all returns a (possibly empty) list of every tag whose id is "book".
element = content.find_all(id='book')
print(status_code)
print(element)
2.利用BeautifulSoup的HTML解析器,生成结构树
import bs4
# Parse a local HTML file and print the text of the element with id "author".
# The with-block closes the file handle as soon as the markup has been read.
with open('example.html') as exampleFile:
    exampleSoup = bs4.BeautifulSoup(exampleFile.read(), 'html5lib')
elems = exampleSoup.select('#author')
# A bare `type(elems)` expression shows nothing when run as a script,
# so print it to make the result type visible.
print(type(elems))
# Raises IndexError if the document has no element with id "author".
print(elems[0].getText())
3.找出特定标签的html元素
# Download the news index page and print its markup.
import requests

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
res = requests.get(newsurl)
# Set the encoding before reading res.text so the body is decoded as UTF-8.
res.encoding = 'utf-8'
print(res.text)

# Parse a small in-memory HTML sample with the built-in parser.
from bs4 import BeautifulSoup

# NOTE(review): the original literal was an empty pair of curly quotes, so the
# sample markup was lost. This sample is reconstructed from the expected
# outputs shown in the later examples of this file -- confirm it matches the
# intended sample.
html_sample = (
    '<html><body>'
    '<h1 id="title">Hello World</h1>'
    '<a class="link" href="#">This is link1</a>'
    '<a class="link" href="#link2">This is link2</a>'
    '</body></html>'
)
soup = BeautifulSoup(html_sample, 'html.parser')
# .text concatenates the text content of every node in the document.
print(soup.text)
4.取得含有特定CSS属性的元素
# Select by id: "#title" matches elements whose id attribute is "title".
alink = soup.select('#title')
print(alink)  # [<h1 id="title">Hello World</h1>]

# Select by class: ".link" matches every element carrying the class "link".
# The parser is named explicitly so the result does not depend on which
# parser libraries happen to be installed.
soup = BeautifulSoup(html_sample, 'html.parser')
for link in soup.select('.link'):
    print(link)
5.练习:
取出h1标签的文本
# Extract the <h1> element and its text from the sample markup.
# The parser is named explicitly so the result does not depend on which
# parser libraries happen to be installed.
soup = BeautifulSoup(html_sample, 'html.parser')
header = soup.select('h1')
print(header)  # [<h1 id="title">Hello World</h1>]
print(header[0])  # <h1 id="title">Hello World</h1>
print(header[0].text)  # Hello World
取出a标签的链接
# Collect every <a> element in the document.
alink = soup.select('a')
print(alink)
# [<a class="link" href="#">This is link1</a>, <a class="link" href="#link2">This is link2</a>]
# Each item is printed as a whole tag; link.get('href') would give only the
# link target.
for link in alink:
    print(link)
取出所有li标签的所有内容
# soup.li is the first <li> tag in the document, or None when there is none.
first_li = soup.li
print(first_li)  # e.g. an <li> element that wraps a single HTML comment
# Only dereference .string when an <li> was actually found; otherwise the
# attribute access on None would raise AttributeError.
if first_li is not None:
    print(first_li.string)
    # For an <li> whose only child is an HTML comment, this is
    # <class 'bs4.element.Comment'>.
    print(type(first_li.string))
取出一条新闻的标题、链接、发布时间、来源
# Print the fields of the first news entry found in `soup`. Going by the
# section heading, these are title, link, publish time and source, in order.
title_nodes = soup.select('div .news-list-title')
print(title_nodes[0].text)

# The href is read from the parent of the thumbnail node.
thumb_nodes = soup.select('div .news-list-thumb')
print(thumb_nodes[0].parent.attrs.get('href'))

# The first two <span> children of the info block are printed in order.
info_spans = soup.select('div .news-list-info > span')
print(info_spans[0].text)
print(info_spans[1].text)