方法一:正则表达式
# Method 1: download a page and extract data with a regular expression.
# Python 2 only: urllib2 and urlparse do not exist under those names in Python 3.
# `url`, `proxy` and `Regular` are not defined here; set them before running.
import re
import urllib2
import urlparse

# Option A - no proxy needed: fetch the page directly.
res = urllib2.urlopen(url).read()

# Option B - proxy needed: send a custom User-agent header through `proxy`.
# Running both options as-is downloads twice; Option B's result overwrites `res`.
user_agent = 'user'
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
# Map the URL's own scheme to the proxy address.
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
res = opener.open(request).read()
print(res)

# `Regular` is the regular expression that matches the data to extract.
print(re.findall(Regular, res))
方法二:Beautiful Soup
安装:
pip install beautifulsoup4
# Method 2: download a page and query it with Beautiful Soup.
# Python 2 only: urllib2 and urlparse do not exist under those names in Python 3.
# `url`, `proxy`, `tag` and `attr` are not defined here; set them before running.
import re
import urllib2
import urlparse

from bs4 import BeautifulSoup

# Option A - no proxy needed: fetch the page directly.
res = urllib2.urlopen(url).read()

# Option B - proxy needed: send a custom User-agent header through `proxy`.
# Running both options as-is downloads twice; Option B's result overwrites `res`.
user_agent = 'user'
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
# Map the URL's own scheme to the proxy address.
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
res = opener.open(request).read()
print(res)

# 'html.parser' selects the parser bundled with the Python standard library.
soup = BeautifulSoup(res, 'html.parser')
fixed_html = soup.prettify()
print(fixed_html)

# Look up the data belonging to a tag.
# `tag` is the tag name, e.g. 'ul'; `attr` is a dict of attribute name/value
# pairs, e.g. {'class': 'country'}.
# Find a single tag:
tagData = soup.find(tag, attrs=attr)
# Find every tag with the same name:
tagsData = soup.find_all(tag)
方法三:Lxml
安装:
pip install lxml
# Method 3: download a page and select elements with lxml CSS selectors.
# Python 2 only: urllib2 and urlparse do not exist under those names in Python 3.
# `url` and `proxy` are not defined here; set them before running.
import re
import urllib2
import urlparse

# `import lxml` alone does not load the `lxml.html` submodule used below.
import lxml.html

# Option A - no proxy needed: fetch the page directly.
res = urllib2.urlopen(url).read()

# Option B - proxy needed: send a custom User-agent header through `proxy`.
# Running both options as-is downloads twice; Option B's result overwrites `res`.
user_agent = 'user'
headers = {'User-agent': user_agent}
request = urllib2.Request(url, headers=headers)
opener = urllib2.build_opener()
# Map the URL's own scheme to the proxy address.
proxy_params = {urlparse.urlparse(url).scheme: proxy}
opener.add_handler(urllib2.ProxyHandler(proxy_params))
res = opener.open(request).read()
print(res)

tree = lxml.html.fromstring(res)
# NOTE(review): cssselect() relies on the separate `cssselect` package in
# recent lxml releases - confirm it is installed.
# Indexing with [0] raises IndexError when the selector matches nothing.
td = tree.cssselect('tr#place>td.area')[0]
area = td.text_content()
print(area)