利用多线程动态抓取数据,网上也有不少教程,但大多过于繁杂——能不能精简再精简?
不多解释,直接上代码,基本上还是很好懂的。
#!/usr/bin/env python # coding=utf-8 import urllib2 import re,sys from bs4 import BeautifulSoup from selenium import webdriver import threading import time reload(sys) sys.setdefaultencoding("utf-8") queue = [ "http://baike.baidu.com/view/8332.htm", "http://baike.baidu.com/view/145819.htm", "http://baike.baidu.com/view/643415.htm", "http://baike.baidu.com/view/157424.htm", "http://baike.baidu.com/view/149759.htm",] crawled_url = set() crawled_word = set() cnt = 0 class BaikeSpider(threading.Thread): """ 模拟浏览器打开页面,多线程爬取数据 """ def __init__(self,name): threading.Thread.__init__(self) self.name = str(name) self.browser = webdriver.Chrome()
# 将抓取数据写入各自的文件 self.fw = open("baike_words_"+self.name+".txt","wb") def run(self): global queue global crawled_url global crawled_word global cnt while queue: url = queue.pop(0) try: self.browser.get(url) # 休眠0.5s,等待数据加载 time.sleep(0.5) links = BeautifulSoup(urllib2.urlopen(url).read(),"lxml").find_all("a") vote = self.browser.find_element_by_class_name("vote-count").text view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text word = self.browser.title.split(u"_")[0] if word in crawled_word or url in crawled_url: continue else: for link in links: if 'href' not in dict(link.attrs) or re.search(u"javascript",link['href']) or len(link['href']) <8: continue tmpurl = link["href"] if re.search("baike.baidu.com/view/d+|baike.baidu.com/subview/d+(/d+)?",tmpurl) and tmpurl n ot in crawled_url: queue.append(tmpurl) crawled_url.add(url) linedata = word+" "+view+" "+vote+" "+url+" " self.fw.write(linedata) except Exception,e: print 'error',e continue cnt += 1 print cnt,self.name,'len',len(queue) def __exit__(self): self.fw.close() if __name__=='__main__': """ 开5个线程 """ for i in range(5): t = BaikeSpider(i) t.start()