zoukankan      html  css  js  c++  java
  • 糗事百科_基于队列和多线程

    import threading
    import time
    from queue import Queue
    
    import requests
    from lxml import etree
    
    
    class QiuBaiSpider(object):
        """Multi-threaded scraper for qiushibaike.com 'hot' pages.

        Pipeline (each stage is a daemon-thread worker connected by a Queue):
            get_url_list -> url_queue -> send_request -> response_queue
            -> analysis_data -> data_queue -> write_file
        """

        def __init__(self):
            # Page-number URL template for the "hot" listing.
            self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
            # Desktop browser User-Agent so the site serves the normal HTML.
            self.headers = {
                'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
            self.data = 0                   # total items extracted
            self.url_queue = Queue()        # page URLs to fetch
            self.response_queue = Queue()   # fetched HTTP responses
            self.data_queue = Queue()       # extracted text items
            self.count = 0                  # pages parsed

        def get_url_list(self):
            """Enqueue the URLs of hot pages 1 through 12."""
            for page in range(1, 13):
                self.url_queue.put(self.base_url.format(page))

        def send_request(self):
            """Worker: fetch each queued URL and enqueue its response.

            Runs forever; intended to be started as a daemon thread.
            """
            while True:
                url = self.url_queue.get()
                try:
                    # timeout prevents one hung connection from stalling the
                    # worker (and therefore url_queue.join()) forever.
                    response = requests.get(url, headers=self.headers, timeout=10)
                    self.response_queue.put(response)
                finally:
                    # Always mark the task done, even if requests.get raised,
                    # otherwise url_queue.join() in _start() deadlocks.
                    self.url_queue.task_done()

        def analysis_data(self):
            """Worker: parse each queued response and enqueue the post titles."""
            while True:
                data = self.response_queue.get().content
                self.count += 1
                html_data = etree.HTML(data)
                div_list = html_data.xpath("""//*[@id="content-left"]/div""")
                for div in div_list:
                    # Guard against posts without an <h2> title: the original
                    # indexed [0] unconditionally, raising IndexError on an
                    # empty xpath result and killing the worker thread.
                    titles = div.xpath('.//h2/text()')
                    if titles:
                        self.data += 1
                        self.data_queue.put(titles[0])
                self.response_queue.task_done()

        def write_file(self):
            """Worker: drain the data queue.

            NOTE(review): despite the name, nothing is persisted -- items are
            consumed and discarded. Kept as-is to preserve behavior.
            """
            while True:
                self.data_queue.get()
                self.data_queue.task_done()

        def _start(self):
            """Spawn all pipeline workers as daemons and wait for completion."""
            th_list = [threading.Thread(target=self.get_url_list)]
            # Two fetcher threads so network I/O overlaps.
            for _ in range(2):
                th_list.append(threading.Thread(target=self.send_request))
            th_list.append(threading.Thread(target=self.analysis_data))
            th_list.append(threading.Thread(target=self.write_file))
            for th in th_list:
                # Thread.setDaemon() is deprecated since Python 3.10;
                # assign the attribute instead.
                th.daemon = True
                th.start()
            # Block until every queued item has been processed; the daemon
            # workers are then killed when the main thread exits.
            for q in (self.url_queue, self.response_queue, self.data_queue):
                q.join()

        def run(self):
            """Run the full pipeline and print elapsed time and item count."""
            start = time.time()
            self._start()
            end = time.time()
            print(end - start, "结束时间")
            print(self.data)
    
    
    # Script entry point: build the spider and run the whole pipeline.
    if __name__ == '__main__':
        spider = QiuBaiSpider()
        spider.run()
  • 相关阅读:
    RabbitMQ in Action(5): Clustering and dealing with failure
    RabbitMQ in Action (2): Running and administering Rabbit
    [转]Setting Keystone v3 domains
    Openstack中RabbitMQ RPC代码分析
    RabbitMQ in Action (1): Understanding messaging
    [转]Understanding OpenStack Authentication: Keystone PKI
    neutron的基本原理
    nova vnc proxy基本原理
    sersync+rsync做实时同步
    使用rsync备份数据
  • 原文地址:https://www.cnblogs.com/ls1997/p/11276819.html
Copyright © 2011-2022 走看看