zoukankan      html  css  js  c++  java
  • python 多线程抓取动态数据

    利用多线程动态抓取数据,网上也有不少教程,但发现过于繁杂,就不能精简再精简?!

    不多解释,直接上代码,基本上还是很好懂的。

    #!/usr/bin/env python
    # coding=utf-8
    
    import urllib2
    import re,sys
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import threading
    import time
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Shared work list of Baidu Baike pages, pre-loaded with five seed entries.
    # The spider threads pop URLs from the front and append newly found links.
    queue = [
        "http://baike.baidu.com/view/8332.htm",
        "http://baike.baidu.com/view/145819.htm",
        "http://baike.baidu.com/view/643415.htm",
        "http://baike.baidu.com/view/157424.htm",
        "http://baike.baidu.com/view/149759.htm",
    ]

    # De-duplication sets the spider checks before recording a page:
    # page URLs and entry titles respectively.
    crawled_url, crawled_word = set(), set()

    # Progress counter, incremented and printed by the spider threads.
    cnt = 0
    
    class BaikeSpider(threading.Thread):
        """Worker thread that crawls Baidu Baike entries through a real browser.

        Each worker owns one Chrome instance and one output file named
        ``baike_words_<name>.txt``.  Workers share the module-level ``queue``,
        ``crawled_url``, ``crawled_word`` and ``cnt``.  A worker stops as soon
        as it finds the queue empty, then closes its file and its browser.
        """

        # Absolute links to Baike entries (/view/<id> or /subview/<id>[/<id>]).
        # Raw string so that "\d" really means "digit".
        _ENTRY_URL = re.compile(
            r"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+(/\d+)?")

        # Serialises updates to the shared counter and the "already crawled"
        # sets, which would otherwise be racy read-modify-write sequences.
        _lock = threading.Lock()

        def __init__(self, name):
            """Create the worker's browser and open its output file.

            name -- any value; its ``str()`` becomes the thread name and
                    part of the output file name.
            """
            threading.Thread.__init__(self)
            self.name = str(name)

            self.browser = webdriver.Chrome()
            # Each thread writes the data it scrapes to its own file.
            self.fw = open("baike_words_" + self.name + ".txt", "wb")

        @staticmethod
        def _is_entry_link(href):
            """Return True if *href* looks like a link to a Baike entry page."""
            if not href or "javascript" in href or len(href) < 8:
                return False
            return BaikeSpider._ENTRY_URL.search(href) is not None

        def _crawl(self, url):
            """Scrape one page; return True if a record was written for it.

            Returns False when the URL or the entry title was already
            recorded.  Any scraping error propagates to the caller.
            """
            # Cheap check first: no need to load a page we already recorded.
            if url in crawled_url:
                return False

            self.browser.get(url)
            # Sleep 0.5s to give the dynamically loaded data time to arrive.
            time.sleep(0.5)

            links = BeautifulSoup(urllib2.urlopen(url).read(), "lxml").find_all("a")
            vote = self.browser.find_element_by_class_name("vote-count").text
            view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text
            word = self.browser.title.split(u"_")[0]

            # Check-and-mark atomically so two threads cannot both record
            # the same entry.
            with self._lock:
                if word in crawled_word or url in crawled_url:
                    return False
                crawled_word.add(word)
                crawled_url.add(url)

            for link in links:
                href = link.get("href")
                if self._is_entry_link(href) and href not in crawled_url:
                    queue.append(href)

            # One tab-separated record per line: title, views, votes, URL.
            linedata = word + "\t" + view + "\t" + vote + "\t" + url + "\n"
            self.fw.write(linedata.encode("utf-8"))
            return True

        def run(self):
            """Pop URLs from the shared queue until it is empty, then clean up."""
            global cnt
            try:
                while True:
                    try:
                        url = queue.pop(0)
                    except IndexError:
                        # Another thread took the last URL (or none are left).
                        break

                    try:
                        if not self._crawl(url):
                            continue
                    except Exception as e:
                        # Best-effort crawler: report the failure and move on.
                        print("error %s" % e)
                        continue

                    with self._lock:
                        cnt += 1
                        done = cnt
                    print("%d %s len %d" % (done, self.name, len(queue)))
            finally:
                self.close()

        def close(self):
            """Close the output file and shut down this worker's browser."""
            try:
                self.fw.close()
            finally:
                self.browser.quit()

        def __exit__(self, *exc_info):
            """Close the output file (kept for backward compatibility)."""
            self.fw.close()


    if __name__ == '__main__':
        # Start five spider threads.
        for i in range(5):
            BaikeSpider(i).start()
    每天一小步,人生一大步!Good luck~
  • 相关阅读:
    win7下安装IIS
    C#在处理多线程更新到UI控件的多种方法
    更新DataGridVeiw中的数据到后台数据库中
    ArcGIS Engine App update
    C#中提供的精准测试程序运行时间的类Stopwatch
    ArcMap10 生成随机点
    HDU 2111 Saving HDU
    HDU 1213 How Many Tables
    HDU 2521 反素数
    HDU 1995 汉诺塔V
  • 原文地址:https://www.cnblogs.com/jkmiao/p/5073727.html
Copyright © 2011-2022 走看看