zoukankan      html  css  js  c++  java
  • 多线程爬取斗图图片

    结果演示

    
     

    代码:

    #encoding:utf-8
    # __author__ = 'donghao'
    # __time__ = 2018/12/24 15:20
    import requests
    import threading
    import urllib.request
    import urllib3
    import os
    import re
    import time
    from lxml import etree
    from queue import Queue
    
    #负责解析图片
    class Producer(threading.Thread):
        """Worker thread that turns listing-page URLs into image download jobs.

        Each producer takes page URLs from ``page_queue``, fetches and parses
        the page, and puts one ``(img_url, filename)`` tuple per image found on
        ``img_queue``.
        """

        # Browser-like User-Agent sent with every page request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'
        }

        def __init__(self, img_queue, page_queue, *args, **kwargs):
            """Store the two shared queues.

            Args:
                img_queue: queue that receives ``(img_url, filename)`` tuples.
                page_queue: queue of listing-page URLs still to be parsed.
                *args, **kwargs: forwarded to ``threading.Thread``.
            """
            super(Producer, self).__init__(*args, **kwargs)
            self.img_queue = img_queue
            self.page_queue = page_queue

        def run(self):
            """Parse pages until ``page_queue`` has nothing left."""
            from queue import Empty

            while True:
                # A non-blocking get avoids the race of "check empty(), then
                # block in get()": with several producers, another thread may
                # take the last URL between the two calls and this thread would
                # then wait forever.
                try:
                    url = self.page_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        @staticmethod
        def _make_filename(img_url, alt):
            """Build the local file name for an image.

            The name is the image description (``alt``) followed by the
            extension taken from ``img_url``. When ``alt`` is missing or empty,
            the base name of the URL is used instead. Characters in the scrub
            set (which includes ``/`` and ``?``) are removed from the whole
            name, so the description cannot inject a path separator.
            """
            end = os.path.splitext(img_url)[1]
            stem = alt or os.path.splitext(os.path.basename(img_url))[0]
            return re.sub(r'[,。??,/\·]', '', stem + end)

        def parse_page(self, url):
            """Fetch one listing page and enqueue every non-gif image on it.

            Args:
                url: address of the listing page to download and parse.
            """
            resp = requests.get(url=url, headers=self.headers)
            html = etree.HTML(resp.text)
            imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
            for img in imgs:
                # The image address is read from the data-original attribute.
                img_url = img.get('data-original')
                if not img_url:
                    # Without an address there is nothing to download.
                    continue
                name = self._make_filename(img_url, img.get('alt'))

                # Hand the download job over to the consumers.
                self.img_queue.put((img_url, name))
                
    #负责下载图片
    class Consumer(threading.Thread):
        """Worker thread that downloads the images queued by the producers.

        Each consumer takes ``(img_url, filename)`` tuples from ``img_queue``
        and saves the image as ``images/<filename>``.
        """

        # Seconds to wait for a new job before re-checking whether any pages
        # are still waiting to be parsed.
        idle_timeout = 3

        def __init__(self, img_queue, page_queue, *args, **kwargs):
            """Store the two shared queues.

            Args:
                img_queue: queue of ``(img_url, filename)`` tuples to download.
                page_queue: queue of listing-page URLs not yet parsed; only
                    used to decide when to stop.
                *args, **kwargs: forwarded to ``threading.Thread``.
            """
            super(Consumer, self).__init__(*args, **kwargs)
            self.img_queue = img_queue
            self.page_queue = page_queue

        def run(self):
            """Download queued images until no work is left.

            The thread stops once no job arrived within ``idle_timeout``
            seconds and ``page_queue`` is empty. A failed download is reported
            and skipped instead of terminating the thread.
            """
            from queue import Empty

            # urlretrieve does not create missing directories.
            os.makedirs('images', exist_ok=True)

            while True:
                # A timed get replaces "check empty(), then block in get()":
                # with several consumers another thread may take the last job
                # between the two calls, leaving this one blocked forever.
                try:
                    img_url, filename = self.img_queue.get(timeout=self.idle_timeout)
                except Empty:
                    if self.page_queue.empty():
                        break
                    # Pages are still pending, so more jobs may arrive.
                    continue

                try:
                    urllib.request.urlretrieve(img_url, 'images/' + filename)
                except (OSError, ValueError) as exc:
                    # OSError covers URL/HTTP errors and file-system errors;
                    # ValueError is raised for a malformed URL.
                    print('%s下载失败: %s' % (filename, exc))
                    continue
                print(filename+'张图片下载完成')
    
    def main():
        """Crawl 10 listing pages with 5 producers and 5 consumers.

        Fills the page queue, starts the worker threads, waits for all of them
        to finish and prints the elapsed time.
        """
        start = time.time()

        image_queue = Queue(1000)
        page_queue = Queue(100)

        # Consumer saves files under images/, so make sure the directory exists
        # before any download starts.
        os.makedirs('images', exist_ok=True)

        # Pages 1..10 inclusive; the previous range(1, 10) stopped at page 9.
        for page in range(1, 11):
            page_queue.put('http://www.doutula.com/photo/list/?page=%d' % page)

        workers = []
        for _ in range(5):
            producer = Producer(image_queue, page_queue)
            producer.start()
            workers.append(producer)

        for _ in range(5):
            consumer = Consumer(image_queue, page_queue)
            consumer.start()
            workers.append(consumer)

        # Wait for every worker before measuring the elapsed time.
        for worker in workers:
            worker.join()

        end = time.time()
        print('耗时:%0.002fs' % (end - start))
    
    
    # Run the crawler only when this file is executed as a script, not when it
    # is imported as a module.
    if __name__ == '__main__':
        main()
    
    
  • 相关阅读:
    博客园停更...
    Linux-常用命令汇总
    Linux-目录结构
    Mysql-python连接操作数据库
    Mysql-概念及常用命令
    Mysql-Sql查询汇总
    Mysql-Sql增删改查
    Mysql-Navicat破解使用
    Mysql-环境配置及问题解决
    Fiddler-AutoResponder替换资源
  • 原文地址:https://www.cnblogs.com/donghaoblogs/p/10389698.html
Copyright © 2011-2022 走看看