利用多线程动态抓取数据,网上也有不少教程,但大多过于繁杂——能不能精简再精简?
不多解释,直接上代码,基本上还是很好懂的。
#!/usr/bin/env python # coding=utf-8 import urllib2 import re,sys from bs4 import BeautifulSoup from selenium import webdriver import threading import time reload(sys) sys.setdefaultencoding("utf-8") queue = [ "http://baike.baidu.com/view/8332.htm", "http://baike.baidu.com/view/145819.htm", "http://baike.baidu.com/view/643415.htm", "http://baike.baidu.com/view/157424.htm", "http://baike.baidu.com/view/149759.htm",] crawled_url = set() crawled_word = set() cnt = 0 class BaikeSpider(threading.Thread): """ 模拟浏览器打开页面,多线程爬取数据 """ def __init__(self,name): threading.Thread.__init__(self) self.name = str(name) self.browser = webdriver.Chrome()
# 将抓取数据写入各自的文件 self.fw = open("baike_words_"+self.name+".txt","wb") def run(self): global queue global crawled_url global crawled_word global cnt while queue: url = queue.pop(0) try: self.browser.get(url) # 休眠0.5s,等待数据加载 time.sleep(0.5) links = BeautifulSoup(urllib2.urlopen(url).read(),"lxml").find_all("a") vote = self.browser.find_element_by_class_name("vote-count").text view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text word = self.browser.title.split(u"_")[0] if word in crawled_word or url in crawled_url: continue else: for link in links: if 'href' not in dict(link.attrs) or re.search(u"javascript",link['href']) or len(link['href']) <8: continue tmpurl = link["href"] if re.search("baike.baidu.com/view/d+|baike.baidu.com/subview/d+(/d+)?",tmpurl) and tmpurl n ot in crawled_url: queue.append(tmpurl) crawled_url.add(url) linedata = word+" "+view+" "+vote+" "+url+" " self.fw.write(linedata) except Exception,e: print 'error',e continue cnt += 1 print cnt,self.name,'len',len(queue) def __exit__(self): self.fw.close() if __name__=='__main__': """ 开5个线程 """ for i in range(5): t = BaikeSpider(i) t.start()