Parse the pages with lxml and write the scraped results into per-page HTML files under the file/ directory. The code is as follows:
# Crawl the listing pages with threads
from urllib.request import Request, urlopen
from time import sleep
from lxml import etree
import _thread
import os

ii = 0  # number of pages finished so far, shared across threads
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

def spilder(page):
    global ii
    # %% escapes the percent signs of the URL-encoded query; %i is the page number
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%%E5%%8C%%97%%E4%%BA%%AC&kw=python&sm=0&p=%i" % page
    req = Request(url=url, headers=headers)
    req_timeout = 5
    resp = urlopen(req, None, req_timeout)
    s = resp.read().decode("UTF-8")
    selector = etree.HTML(s)
    # Grab both the href and the link text of every job-title cell
    links = selector.xpath('//tr/td[@class="zwmc"]/div/a/@href|//tr/td[@class="zwmc"]/div/a/text()')
    f = open("file/%i.html" % page, 'w')
    for link in links:
        f.write("%s<br>" % link)
        print(link)
    ii += 1  # mark this page as done
    print(ii)
    f.close()

def main():
    os.makedirs("file", exist_ok=True)  # make sure the output directory exists
    for i in range(1, 11):
        _thread.start_new_thread(spilder, (i,))
    # Poll until all 10 page threads have finished, for at most 30 seconds
    for kk in range(15):
        if ii > 9:
            break
        sleep(2)

main()
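As a side note, the polling loop on ii exists only because _thread offers no way to wait for a thread. With the standard threading module, main() can wait for each worker deterministically. A minimal sketch reusing the spilder function above (a variant, not the original author's code):

import threading

def main():
    threads = [threading.Thread(target=spilder, args=(i,)) for i in range(1, 11)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until every page thread has finished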
Running this may fail with socket.timeout: timed out; one common workaround is to route the requests through proxy IPs.
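For illustration, a minimal sketch of plugging a proxy into urllib via ProxyHandler, reusing the headers dict defined above. The proxy address is a placeholder, not a working endpoint, and you would substitute your own:

from urllib.request import Request, ProxyHandler, build_opener

# Placeholder proxy address, replace with a real proxy of your own
opener = build_opener(ProxyHandler({'http': 'http://127.0.0.1:8080'}))
url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%8C%97%E4%BA%AC&kw=python&sm=0&p=1"
req = Request(url=url, headers=headers)
try:
    resp = opener.open(req, timeout=5)
    html = resp.read().decode("UTF-8")
except Exception as e:  # catches socket.timeout among others
    print("request failed:", e)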