zoukankan      html  css  js  c++  java
  • python 多线程抓取动态数据

    利用多线程抓取动态数据,网上已有不少教程,但大多过于繁杂,难道就不能精简再精简吗?

    不多解释,直接上代码,基本上还是很好懂的。

    #!/usr/bin/env python
    # coding=utf-8
    
    import urllib2
    import re,sys
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import threading
    import time
    reload(sys)
    sys.setdefaultencoding("utf-8")
    
    # Work list shared by every spider thread, pre-loaded with the seed pages.
    queue = list((
        "http://baike.baidu.com/view/8332.htm",
        "http://baike.baidu.com/view/145819.htm",
        "http://baike.baidu.com/view/643415.htm",
        "http://baike.baidu.com/view/157424.htm",
        "http://baike.baidu.com/view/149759.htm",
    ))

    # URLs and entry titles the spiders consult to avoid repeating work.
    crawled_url, crawled_word = set(), set()

    # Running tally that the spiders increment and print as progress output.
    cnt = 0
    
    class BaikeSpider(threading.Thread):
        """Worker thread that crawls Baidu Baike entries through a real browser.

        Each worker owns one Chrome instance, used to read the vote and
        page-view counters from the rendered page, and one output file named
        ``baike_words_<name>.txt``.  All workers share the module-level
        ``queue``, ``crawled_url``, ``crawled_word`` and ``cnt``.
        """

        # Entry pages look like .../view/<id> or .../subview/<id>[/<id>].
        _ENTRY_URL_RE = re.compile(
            r"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+(/\d+)?")

        # Guards the shared visited sets and the progress counter.
        _state_lock = threading.Lock()

        def __init__(self, name):
            """Start a browser and open this worker's output file.

            :param name: worker label; converted with ``str`` and used as the
                thread name and as part of the output file name.
            """
            threading.Thread.__init__(self)
            self.name = str(name)
            self._resources_released = False

            self.browser = webdriver.Chrome()
            # Every worker writes to its own file, so writes need no locking.
            self.fw = open("baike_words_" + self.name + ".txt", "wb")

        def run(self):
            """Process URLs from the shared queue until it is found empty.

            Errors raised while handling a single page are printed and the
            page is skipped.  The output file and the browser are released
            when the loop ends, whatever the reason.
            """
            global cnt
            try:
                while True:
                    try:
                        url = queue.pop(0)
                    except IndexError:
                        # Another worker can drain the list between any
                        # emptiness check and the pop, so just try the pop.
                        break

                    try:
                        recorded = self._crawl(url)
                    except Exception as e:
                        print("error %s" % e)
                        continue
                    if not recorded:
                        continue

                    with self._state_lock:
                        cnt += 1
                        done = cnt
                    print("%s %s len %s" % (done, self.name, len(queue)))
            finally:
                self._close()

        def _crawl(self, url):
            """Load one page, queue its entry links and write its record.

            :param url: address of the page to process.
            :returns: ``True`` when a record was written, ``False`` when the
                page was skipped because its URL or title was already seen.
            """
            if url in crawled_url:
                # Cheap check first: no point loading a page we already have.
                return False

            self.browser.get(url)
            # Sleep 0.5s to give the page's data time to load.
            time.sleep(0.5)
            anchors = BeautifulSoup(
                urllib2.urlopen(url).read(), "lxml").find_all("a")
            vote = self.browser.find_element_by_class_name("vote-count").text
            view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text
            word = self.browser.title.split(u"_")[0]

            # Check-and-mark under the lock so two workers cannot both claim
            # the same page.  The title is recorded as well, otherwise the
            # ``crawled_word`` test could never succeed.
            with self._state_lock:
                if word in crawled_word or url in crawled_url:
                    return False
                crawled_url.add(url)
                crawled_word.add(word)

            for href in self._entry_links(anchors):
                if href not in crawled_url:
                    queue.append(href)

            # One record per line, tab-separated; encoded explicitly because
            # the file is opened in binary mode.
            record = u"\t".join([word, view, vote, url]) + u"\n"
            self.fw.write(record.encode("utf-8"))
            return True

        def _entry_links(self, anchors):
            """Yield the ``href`` of each anchor that matches an entry URL."""
            for anchor in anchors:
                href = anchor.get("href")
                if not href or len(href) < 8 or "javascript" in href:
                    continue
                if self._ENTRY_URL_RE.search(href):
                    yield href

        def _close(self):
            """Close the output file and quit the browser, at most once."""
            if self._resources_released:
                return
            self._resources_released = True
            try:
                self.fw.close()
            finally:
                self.browser.quit()

        def __exit__(self, *exc_info):
            """Release resources; kept so existing ``__exit__()`` calls work."""
            self._close()


    if __name__ == '__main__':
        # Start 5 worker threads.
        for i in range(5):
            t = BaikeSpider(i)
            t.start()
    每天一小步,人生一大步!Good luck~
  • 相关阅读:
    OpenGL学习笔记2——顶点数组
    OpenGL学习笔记1——第一个程序
    OpenGL学习笔记0——安装库
    SDRAM控制器的Verilog建模之一
    Norflash控制器的Verilog建模之三(測試)
    Norflash控制器的Verilog建模之二(仿真)
    Norflash控制器的Verilog建模之一
    simulink中定义结构体信号线
    MATLAB转C语言(二)
    MATLAB GUI界面设计------“轴”组件配置
  • 原文地址:https://www.cnblogs.com/jkmiao/p/5073727.html
Copyright © 2011-2022 走看看