zoukankan      html  css  js  c++  java
  • 糗事百科爬虫_基于线程池

    import threading
    import time
    from multiprocessing.dummy import Pool
    from queue import Empty, Queue

    import requests
    from lxml import etree
    
    
    class QiuBaiSpider(object):
        """Crawl the qiushibaike.com "hot" listing pages with a thread pool.

        Page URLs are pushed onto ``url_queue``; each worker task pulls one URL,
        downloads it and passes the first ``<h2>`` text of every entry found
        under ``#content-left`` to ``write_file``.

        Counters:
            request:  number of URLs queued so far.
            response: number of URLs whose processing has finished, whether it
                succeeded or failed.
            count:    number of pages handed to the parser.
            data:     number of text items extracted.

        ``response``, ``count`` and ``data`` are modified by worker threads and
        are therefore only updated while holding an internal lock.
        """

        # 1. Target site and request headers.
        def __init__(self, page_count=12, workers=4, timeout=10):
            """Set up the URL template, headers, thread pool and counters.

            Args:
                page_count: Number of listing pages to queue, starting at page 1.
                workers: Size of the thread pool and of the initial task batch.
                timeout: Seconds passed to ``requests.get`` so that a stalled
                    connection cannot block a worker indefinitely.
            """
            self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
            self.headers = {
                'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
            self.page_count = page_count
            self.workers = workers
            self.timeout = timeout
            self.data = 0
            self.pool = Pool(processes=workers)
            self.url_queue = Queue()
            self.count = 0
            self.request = 0
            self.response = 0
            self.is_finish = False
            # Guards the counters that worker threads update concurrently.
            self._lock = threading.Lock()
            # Set by a worker once every queued URL has been accounted for.
            self._done = threading.Event()

        # 2. Build the URLs to crawl.
        def get_url_list(self):
            """Queue one URL per page and count each of them in ``request``."""
            for page in range(1, self.page_count + 1):
                self.url_queue.put(self.base_url.format(page))
                self.request += 1

        # 3. Send the request.
        def send_request(self, url):
            """Print ``url``, fetch it with the spider's headers and return the response."""
            print(url)
            return requests.get(url, headers=self.headers, timeout=self.timeout)

        # 4. Parse the data.
        def analysis_data(self, data):
            """Extract the entries of one downloaded page and store them.

            Args:
                data: Response object whose ``content`` attribute holds the HTML.
            """
            content = data.content
            with self._lock:
                self.count += 1
            html_data = etree.HTML(content)
            if html_data is None:
                # lxml may return None when no element tree could be built.
                return
            for entry in html_data.xpath("""//*[@id="content-left"]/div"""):
                titles = entry.xpath('.//h2/text()')
                if not titles:
                    # A div without an <h2> is skipped instead of raising
                    # IndexError and aborting the rest of the page.
                    continue
                with self._lock:
                    self.data += 1
                self.write_file(titles[0])

        # 5. Store the data.
        def write_file(self, data):
            """Persist one extracted item; this implementation prints it."""
            print(data)

        def _start(self):
            """Process one queued URL; return at once if the queue is empty."""
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                # A blocking get() would park this worker forever once the
                # queue has been drained.
                return
            try:
                self.analysis_data(self.send_request(url))
            except Exception as exc:
                # Worker-task boundary: a single bad page is reported and
                # skipped so that it cannot stall the whole crawl.
                print("failed to process {}: {!r}".format(url, exc))
            finally:
                # Count the URL as handled even on failure; otherwise
                # async_start() would wait for a response that never comes.
                with self._lock:
                    self.response += 1
                    if self.response >= self.request:
                        self._done.set()

        def _submit(self):
            """Schedule one ``_start`` task that re-arms itself through ``_callback``."""
            self.pool.apply_async(
                self._start, callback=self._callback, error_callback=self._callback)

        def _callback(self, temp):
            """Schedule the next task while URLs remain queued.

            Args:
                temp: Result or exception of the finished task; not used.
            """
            if not self.url_queue.empty():
                self._submit()

        def async_start(self):
            """Queue the URLs, crawl them on the pool and block until all are handled."""
            self.is_finish = False
            self._done.clear()
            self.get_url_list()
            with self._lock:
                pending = self.request - self.response
            if pending > 0:
                for _ in range(min(self.workers, pending)):
                    self._submit()
                # Sleep until a worker signals completion instead of polling.
                self._done.wait()
            self.is_finish = True

        def run(self):
            """Run the crawl, then print the elapsed seconds and the item count."""
            start = time.time()
            self.async_start()
            end = time.time()
            print(end - start, "结束时间")
            print(self.data)
    
    
    if __name__ == '__main__':
        # Script entry point: construct a spider and run the crawl.
        QiuBaiSpider().run()
  • 相关阅读:
    new delete的内部实现代码
    子串的替换
    求字符串的长度
    TSQL语句学习(四)
    TSQL语句学习(二)
    杭电acm1036
    杭电acm2032
    杭电acm2045
    杭电acm2072
    杭电acm1029
  • 原文地址:https://www.cnblogs.com/ls1997/p/11284523.html
Copyright © 2011-2022 走看看