# Python web-scraping video course: from complete beginner to Scrapy expert (easy introduction)
# https://item.taobao.com/item.htm?spm=a1z38n.10677092.0.0.482434a6EmUbbW&id=564564604865
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 08:53:08 2016
Scrape chemical-industry standard records (supplementary data-entry project).
@author: Administrator
"""
import requests,bs4
# Fetch one standard-detail page and save selected <h5> fields to hb.txt.
#
# Steps: download the page, decode it as GBK, collect the text of every <h5>
# element, echo each one to stdout, and write to hb.txt only those entries
# that contain one of the labels in WANTED_LABELS.
webpage = "http://www.bzwxw.com/html/2016/1988_0116/9.html"

# Substrings that mark the <h5> entries worth keeping
# (standard number, issuing department, implementation date).
WANTED_LABELS = ("标准编号", "发布部门", "实施日期")

# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(webpage, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page.
res.raise_for_status()
# Force GBK decoding; without it the Chinese text came out garbled.
res.encoding = 'gbk'

# res.text is an already-decoded str, so no encoding hint is given to the parser.
soup1 = bs4.BeautifulSoup(res.text, "lxml")

title = soup1.select('title')
# Guard against a page without a <title> element instead of raising IndexError.
title_content = title[0].getText() if title else ""

StandardCode = soup1.select('h5')
content_list = [node.getText() for node in StandardCode]

# Open the output only after the fetch and parse succeeded, so a failed run
# does not truncate an existing hb.txt; `with` closes the file even on error.
with open("hb.txt", 'w', encoding='utf-8') as text:
    for entry in content_list:
        print("i:", entry)
        if any(label in entry for label in WANTED_LABELS):
            # NOTE(review): entries are written back-to-back with no separator,
            # exactly as before; confirm whether a newline between them is wanted.
            text.write(entry)