zoukankan      html  css  js  c++  java
  • 生产者消费者模式

    通用代码

    import os
    import threading
    from queue import Empty, Queue
    from urllib import request

    import requests
    from lxml import etree
    
    
    class Producer(threading.Thread):
        """Worker thread that turns listing-page URLs into image download jobs.

        Each producer repeatedly takes a page URL from ``page_queue``, fetches
        and parses that page, and puts one ``(img_url, img_name)`` tuple per
        image found onto ``img_queue``. The thread ends once ``page_queue``
        has no more URLs to hand out.
        """

        # Browser-like User-Agent sent with every page request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        }

        # Seconds to wait on the server before giving up on a page request.
        # Without a timeout, requests.get() may block indefinitely.
        request_timeout = 10

        def __init__(self, page_queue, img_queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: queue of listing-page URLs to fetch.
                img_queue: queue that receives ``(img_url, img_name)`` tuples.
            """
            super(Producer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue

        def run(self):
            """Take URLs from ``page_queue`` until it is empty, parsing each one."""
            while True:
                # A non-blocking get avoids the race in "check empty(), then
                # get()": another producer may take the last URL between the
                # two calls, leaving this thread blocked in get() forever.
                try:
                    url = self.page_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Fetch one listing page and enqueue every image found on it.

            A page that cannot be fetched is reported and skipped so that a
            single failure does not terminate the worker thread.

            Args:
                url: address of the listing page to download and parse.
            """
            try:
                response = requests.get(
                    url=url, headers=self.headers, timeout=self.request_timeout
                )
                response.raise_for_status()
            except requests.RequestException as exc:
                print("页面请求失败: %s (%s)" % (url, exc))
                return

            html = etree.HTML(response.text)
            if html is None:
                # etree.HTML returns None when the body has nothing to parse.
                return

            img_list = html.xpath('//div[@class="page-content text-center"]/div/a/img')
            for img in img_list:
                img_url = img.get('data-original')
                alt_text = img.get('alt')
                # Skip <img> tags lacking either attribute instead of letting
                # an IndexError from a [0] lookup kill the thread.
                if not img_url or alt_text is None:
                    continue
                self.img_queue.put((img_url, alt_text + '.jpg'))
    
    
    
    
    class Consumer(threading.Thread):
        """Worker thread that downloads the images queued by the producers.

        Each consumer repeatedly takes an ``(img_url, img_name)`` tuple from
        ``img_queue`` and saves the image under ``./imgs/``. The thread ends
        when no image has arrived for ``idle_timeout`` seconds and
        ``page_queue`` holds no more pages.
        """

        # Seconds to wait for a new image before checking whether to stop.
        # NOTE(review): a page that takes longer than this to fetch and parse
        # after page_queue has drained can still be missed; raise the value
        # if producers are slow.
        idle_timeout = 5

        def __init__(self, page_queue, img_queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: queue of listing-page URLs still to be parsed.
                img_queue: queue of ``(img_url, img_name)`` tuples to download.
            """
            super(Consumer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.img_queue = img_queue

        def run(self):
            """Download queued images until the work appears to be finished."""
            while True:
                # A timed get replaces "check empty(), then blocking get()".
                # That pattern could exit while producers were still parsing,
                # or block forever once another consumer took the last item.
                try:
                    img_url, img_name = self.img_queue.get(timeout=self.idle_timeout)
                except Empty:
                    if self.page_queue.empty():
                        break
                    continue

                try:
                    request.urlretrieve(img_url, "./imgs/" + img_name)
                except (OSError, ValueError) as exc:
                    # OSError covers network and filesystem failures;
                    # ValueError is raised for malformed URLs. Report the
                    # failure and keep going so one bad image does not
                    # terminate the worker.
                    print(img_name + " 下载失败: " + str(exc))
                    continue
                print(img_name + " 下载完成!")
    
    # Entry point: builds the shared queues and hands them to the worker threads.
    def main():
        """Queue the first ten listing pages and start the worker threads.

        Creates the bounded page and image queues, makes sure the download
        directory exists, fills the page queue, then starts three producer
        and three consumer threads that share both queues.
        """
        page_queue = Queue(50)   # listing-page URLs waiting to be parsed
        img_queue = Queue(100)   # (img_url, img_name) tuples waiting for download

        # Consumers save into ./imgs/; create it up front so downloads do not
        # fail with FileNotFoundError on a fresh checkout.
        os.makedirs("./imgs", exist_ok=True)

        # Crawl the first 10 listing pages.
        for page_no in range(1, 11):
            page_queue.put("https://www.doutula.com/photo/list/?page=%d" % page_no)

        # Start producers first so images begin flowing before consumers poll.
        for _ in range(3):
            Producer(page_queue, img_queue).start()

        for _ in range(3):
            Consumer(page_queue, img_queue).start()
    
    
    # Start the crawl only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        main()
    
  • 相关阅读:
    hdu 4614 线段树 二分
    cf 1066d 思维 二分
    lca 最大生成树 逆向思维 2018 徐州赛区网络预赛j
    rmq学习
    hdu 5692 dfs序 线段树
    dfs序介绍
    poj 3321 dfs序 树状数组 前向星
    cf 1060d 思维贪心
    【PAT甲级】1126 Eulerian Path (25分)
    【PAT甲级】1125 Chain the Ropes (25分)
  • 原文地址:https://www.cnblogs.com/weiweivip666/p/14028182.html
Copyright © 2011-2022 走看看