zoukankan      html  css  js  c++  java
  • 糗事百科_基于队列和多线程

    import threading
    import time
    from queue import Queue
    
    import requests
    from lxml import etree
    
    
    class QiuBaiSpider:
        """Multi-threaded scraper for the qiushibaike.com "hot" listing pages.

        The work is organised as a three-stage pipeline connected by queues:

        ``url_queue`` -> ``response_queue`` -> ``data_queue``

        Each stage runs in one or more daemon threads that loop forever; the
        main thread decides when the run is over by joining the queues, so every
        ``get()`` must be paired with exactly one ``task_done()``.
        """

        # Seconds to wait for a single HTTP request before giving up on it.
        REQUEST_TIMEOUT = 10

        def __init__(self, page_count=12):
            """Set up the target URL template, request headers and queues.

            Args:
                page_count: number of listing pages to fetch, starting at page 1.
            """
            self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
            self.headers = {
                'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
            self.page_count = page_count
            # Number of text items extracted so far.
            self.data = 0
            self.url_queue = Queue()
            self.response_queue = Queue()
            self.data_queue = Queue()
            # Number of responses handed to the parser so far.
            self.count = 0

        def get_url_list(self):
            """Enqueue the URL of every listing page (1..page_count)."""
            for page in range(1, self.page_count + 1):
                self.url_queue.put(self.base_url.format(page))

        def send_request(self):
            """Worker loop: fetch each queued URL and enqueue the response.

            A failed request is reported and skipped. ``task_done()`` is called
            in ``finally`` so that a failure can never leave ``url_queue.join()``
            waiting forever.
            """
            while True:
                url = self.url_queue.get()
                try:
                    response = requests.get(
                        url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                    # Enqueue before task_done() so the next stage's queue is
                    # already non-empty when url_queue.join() returns.
                    self.response_queue.put(response)
                except requests.RequestException as exc:
                    print('request failed for {}: {}'.format(url, exc))
                finally:
                    self.url_queue.task_done()

        def analysis_data(self):
            """Worker loop: parse each response and enqueue the extracted texts.

            For every ``div`` directly under ``#content-left`` the first text
            node of a nested ``<h2>`` is queued; divs without one are skipped.
            """
            while True:
                response = self.response_queue.get()
                try:
                    self.count += 1
                    html_data = etree.HTML(response.content)
                    # etree.HTML may yield None when there is nothing to parse.
                    if html_data is None:
                        continue
                    for div in html_data.xpath("""//*[@id="content-left"]/div"""):
                        texts = div.xpath('.//h2/text()')
                        if not texts:
                            continue
                        self.data += 1
                        self.data_queue.put(texts[0])
                except etree.LxmlError as exc:
                    print('failed to parse response: {}'.format(exc))
                finally:
                    # Runs on every path (including ``continue``) so the queue's
                    # unfinished-task counter always reaches zero.
                    self.response_queue.task_done()

        def write_file(self):
            """Worker loop: consume extracted items.

            NOTE(review): items are currently only drained from the queue;
            nothing is persisted yet.
            """
            while True:
                self.data_queue.get()
                self.data_queue.task_done()

        def _start(self):
            """Start all pipeline threads and block until every queue is drained."""
            # URL producer.
            th_url = threading.Thread(target=self.get_url_list)
            th_list = [th_url]

            # HTTP fetchers.
            for _ in range(2):
                th_list.append(threading.Thread(target=self.send_request))

            # Parser and consumer.
            th_list.append(threading.Thread(target=self.analysis_data))
            th_list.append(threading.Thread(target=self.write_file))
            print(th_list)

            # Daemon threads: the infinite worker loops must not keep the
            # process alive once the main thread has finished.
            for th in th_list:
                th.daemon = True
                th.start()

            # Wait for the producer first. Otherwise url_queue.join() could run
            # before any URL has been enqueued and return immediately, ending
            # the run before any work was done.
            th_url.join()

            # Join in pipeline order: each stage enqueues its output before
            # marking its input done, so the next queue is complete by then.
            for q in (self.url_queue, self.response_queue, self.data_queue):
                q.join()

        def run(self):
            """Run the whole pipeline and print the elapsed time and item count."""
            start = time.time()
            self._start()
            end = time.time()
            print(end - start, "结束时间")
            print(self.data)
    
    
    # Script entry point: build a spider and run the full pipeline.
    if __name__ == '__main__':
        QiuBaiSpider().run()
  • 相关阅读:
    vSan中见证组件witness详解
    zabbix监控企业esxi虚拟机
    新特性之MAPI over HTTP 配置 MAPI over HTTP
    Exchange Server 产品路线图 及 补丁下载
    人生的第一桶金
    这不是我想要的生活,努力才是王道!
    孤独的灵魂该去何处安家
    如何查看myeclipse是否激活
    Visual Studio 2013如何破解(密钥激活)
    unity破解步骤
  • 原文地址:https://www.cnblogs.com/ls1997/p/11276819.html
Copyright © 2011-2022 走看看