zoukankan      html  css  js  c++  java
  • 16-多线程爬取糗事百科(python+Thread)

    https://www.cnblogs.com/alamZ/p/7414020.html   课件内容

    #_*_ coding: utf-8 _*_
    '''
    Created on 2018年7月17日
    @author: sss
    function: 利用多线程爬取糗事百科页面
    
    '''
    #使用线程库
    import threading
    #队列
    from queue import Queue, Empty
    #解析库
    from lxml import etree
    #json处理
    import json
    import os
    import time
    import requests
    from pickle import FALSE
    
    CRAWL_EXIT = False
    PARSE_EXIT = False 
    
    class TreadCrawl(threading.Thread):
        """Crawl thread: repeatedly takes a page number from ``pageQueue``,
        fetches that listing page and puts the raw HTML onto ``dataQueue``.

        The loop runs until the module-level flag ``CRAWL_EXIT`` becomes True.
        (Class name keeps the original "TreadCrawl" typo so existing callers
        are unaffected.)
        """

        def __init__(self, threadName, pageQueue, dataQueue):
            # Initialize the base Thread first.
            super(TreadCrawl, self).__init__()
            # Display name used in start/stop log messages.
            self.threadName = threadName
            # Queue of page numbers still to fetch.
            self.pageQueue = pageQueue
            # Queue receiving the raw HTML of each fetched page.
            self.dataQueue = dataQueue
            # Pretend to be a desktop browser so the site serves the normal page.
            self.headers = {"User-Agent" : "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

        def run(self):
            print('启动' + self.threadName)
            while not CRAWL_EXIT:
                try:
                    # Non-blocking get: raises queue.Empty immediately when no
                    # page number is available, instead of blocking forever.
                    page = self.pageQueue.get(False)
                except Empty:
                    # Nothing to do right now; re-check the exit flag.
                    continue
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                try:
                    content = requests.get(url, headers=self.headers).text
                except Exception as exc:
                    # Best effort: report the failed page but keep crawling.
                    print('%s 抓取 %s 失败: %s' % (self.threadName, url, exc))
                    continue
                # Politeness delay between requests.
                time.sleep(1)
                self.dataQueue.put(content)
            print('结束' + self.threadName)
    
    class ThreadParse(threading.Thread):
        """Parse thread: takes raw page HTML off ``dataQueue``, extracts each
        joke item with XPath, and appends one JSON object per line to the
        shared output file.

        The loop runs until the module-level flag ``PARSE_EXIT`` becomes True.
        """

        def __init__(self, threadName, dataQueue, filename, lock):
            super(ThreadParse, self).__init__()
            # Display name used in start/stop log messages.
            self.threadName = threadName
            # Queue of raw HTML pages produced by the crawl threads.
            self.dataQueue = dataQueue
            # Already-open, writable file object shared by all parse threads.
            self.filename = filename
            # Lock serializing writes to the shared file.
            self.lock = lock

        def run(self):
            print('启动' + self.threadName)
            while not PARSE_EXIT:
                try:
                    # Non-blocking get: raises queue.Empty when nothing is ready.
                    html = self.dataQueue.get(False)
                except Empty:
                    continue
                try:
                    self.parse(html)
                except Exception as exc:
                    # Best effort: a malformed page must not kill the thread.
                    print('%s 解析失败: %s' % (self.threadName, exc))
            print('退出' + self.threadName)

        def parse(self, html):
            """Extract username/image/content/votes/comments for every joke
            on the page and write each as a JSON line under the lock."""
            # Parse the raw markup into an HTML DOM.
            dom = etree.HTML(html)
            # Each joke lives in a div whose id starts with "qiushi_tag_".
            node_list = dom.xpath('//div[contains(@id, "qiushi_tag_")]')
            for node in node_list:
                # Username; strip embedded newlines from the text.
                username = node.xpath('.//h2')[0].text.replace('\n', '')
                # All image URLs in the item (kept as a list, may be empty).
                image = node.xpath('.//img/@src')
                # Joke text under the title, newlines stripped.
                content = node.xpath('./a/div/span')[0].text.replace('\n', '')
                # Up-vote count.
                zan = node.xpath('./div/span/i')[0].text
                # Comment count.
                comment = node.xpath('./div/span/a/i')[0].text

                items = {
                    'username' : username,
                    'image' : image,
                    'content' : content,
                    'zan' : zan,
                    'comments' : comment
                    }

                # Hold the lock while writing so concurrent parse threads do
                # not interleave partial lines in the shared file.
                with self.lock:
                    self.filename.write(json.dumps(items, ensure_ascii = False) + "\n")
            print('已写入')
                
    
    def main():
        """Entry point: start three crawl threads and three parse threads,
        wait for both queues to drain, then shut everything down cleanly."""
        # Page-number queue, bounded to 20 slots; pages 1-4 are enqueued.
        pageQueue = Queue(20)
        for page in range(1, 5):
            pageQueue.put(page)

        # Unbounded queue carrying raw page HTML from crawlers to parsers.
        dataQueue = Queue()

        # Make sure the output directory exists before opening the file.
        os.makedirs('./qiushi', exist_ok=True)
        filename = open('./qiushi/duanzi.json', 'a', encoding='utf-8')

        # One lock shared by all parse threads to serialize file writes.
        lock = threading.Lock()

        # Start the three crawl threads.
        threadcrawl = []
        for threadName in ['采集线程 1号', '采集线程 2号', '采集线程 3号']:
            thread = TreadCrawl(threadName, pageQueue, dataQueue)
            thread.start()
            threadcrawl.append(thread)

        # Start the three parse threads.
        threadparse = []
        for threadName in ['解析线程1号', '解析线程2号', '解析线程3号']:
            thread = ThreadParse(threadName, dataQueue, filename, lock)
            thread.start()
            threadparse.append(thread)

        # Wait until every page number has been taken. Sleep between checks
        # instead of busy-spinning, which would burn a full CPU core.
        while not pageQueue.empty():
            time.sleep(0.1)

        # pageQueue drained: tell the crawl threads to leave their loops.
        global CRAWL_EXIT
        CRAWL_EXIT = True
        print('pageQueue为空')

        for thread in threadcrawl:
            thread.join()

        # Wait until every fetched page has been parsed.
        while not dataQueue.empty():
            time.sleep(0.1)

        # dataQueue drained: tell the parse threads to leave their loops.
        global PARSE_EXIT
        PARSE_EXIT = True

        for thread in threadparse:
            thread.join()

        # Take the lock so no parser is mid-write when the file is closed.
        with lock:
            filename.close()
        print('谢谢使用!')
    
    # Run the crawler only when executed as a script, not on import.
    if __name__ == "__main__":
        main()
        
        
        
        
        
        
    

      

  • 相关阅读:
    SQL去除重复记录
    FullCalendar应用——整合农历节气和节日
    Dropzone.js实现文件拖拽上传
    HTML5实现文件断点续传
    FullCalendar日历插件说明文档
    网络电影免会员播放器
    百度网盘搜索工具
    HTML5学习
    HTML2 -- 布局格式
    JS10 -- url问号后的数据
  • 原文地址:https://www.cnblogs.com/zhumengdexiaobai/p/9325573.html
Copyright © 2011-2022 走看看