zoukankan      html  css  js  c++  java
  • 【爬虫】多线程爬取表情包

    '''
    Scrape meme images ("biaoqingbao") using multiple threads and queues.
    URL:http://www.bbsnet.com/doutu/page/1
    '''
    
    import requests
    from lxml import etree
    import os
    import re
    from urllib import request
    from queue import Queue
    import threading
    
    
    class Producer(threading.Thread):
        '''
        Worker thread that requests listing pages and queues the images found.

        Each URL taken from ``url_queue`` is fetched and parsed; for every
        matching ``<img>`` element a ``(file_name, image_url)`` tuple is put
        on ``img_queue``.
        '''

        # Characters stripped from an image title before it becomes a file name.
        _TITLE_JUNK = re.compile(r'[-+*.?。,!?、/()“”">::]*')

        # Seconds to wait for a page response before giving up on that page.
        REQUEST_TIMEOUT = 10

        def __init__(self,url_queue,img_queue,*args,**kwargs):
            super().__init__(*args,**kwargs)
            self.url_queue = url_queue
            self.img_queue = img_queue

        def run(self):
            '''Parse pages until ``url_queue`` has been drained.'''
            # Imported locally because the file only imports ``Queue`` from ``queue``.
            from queue import Empty

            while True:
                # get_nowait() makes "check and take" a single atomic step.
                # Checking empty() and then calling a blocking get() lets
                # another thread take the last URL in between, which would
                # leave this thread blocked forever.
                try:
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        @staticmethod
        def _build_filename(title, img_url):
            '''
            Return the local file name for an image, or None if it is unusable.

            The title is stripped of the characters in ``_TITLE_JUNK`` and the
            extension of ``img_url`` is appended. ``None`` is returned when
            either attribute is missing or empty.
            '''
            if not title or not img_url:
                return None
            cleaned = Producer._TITLE_JUNK.sub('', title)
            # os.path.splitext() splits "name.ext" into ("name", ".ext").
            return cleaned + os.path.splitext(img_url)[1]

        def parse_page(self,url):
            '''
            Fetch ``url`` and queue a ``(file_name, image_url)`` tuple per image.

            A failed request is reported and skipped so that one bad page does
            not terminate the thread.
            '''
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
            }
            try:
                response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print("Failed to fetch {}: {}".format(url, exc))
                return
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            if html is None:
                # Nothing parseable came back (e.g. an empty body).
                return

            for img in html.xpath('//div[@class="tagbqppdiv"]//img'):
                img_url = img.get('data-original')
                new_title = self._build_filename(img.get('title'), img_url)
                if new_title is None:
                    # Element lacks a title or a data-original attribute.
                    continue
                # Hand the file name and the image URL over to the consumers.
                self.img_queue.put((new_title,img_url))
    
    class Consumer(threading.Thread):
        '''
        Worker thread that downloads queued images to the local disk.

        Items on ``img_queue`` are ``(file_name, image_url)`` tuples; each
        image is saved under ``IMAGE_DIR``.
        '''

        # Directory (relative to the working directory) the images are saved in.
        IMAGE_DIR = "./image/"

        # Seconds to wait for a new image before checking whether to stop.
        # The constructor receives no explicit "producers are done" signal, so
        # termination is a heuristic: the thread stops once url_queue is empty
        # and no image has arrived for this long.
        idle_timeout = 10

        def __init__(self, url_queue, img_queue, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.url_queue = url_queue
            self.img_queue = img_queue

        def run(self):
            '''Download images until both queues have stayed empty.'''
            # Imported locally because the file only imports ``Queue`` from ``queue``.
            from queue import Empty

            # urlretrieve() does not create missing directories.
            os.makedirs(self.IMAGE_DIR, exist_ok=True)

            while True:
                # A timed get() avoids two problems of "if empty(): break"
                # followed by a blocking get(): blocking forever when another
                # thread takes the last item, and quitting while a page is
                # still being parsed and img_queue is only momentarily empty.
                try:
                    new_title, img_url = self.img_queue.get(timeout=self.idle_timeout)
                except Empty:
                    if self.url_queue.empty():
                        break
                    continue

                # Download the image; one failure must not end the thread.
                # URLError/HTTPError are OSError subclasses; ValueError is
                # raised for malformed URLs.
                try:
                    request.urlretrieve(img_url, self.IMAGE_DIR + new_title)
                except (OSError, ValueError) as exc:
                    print("Failed to download {}: {}".format(img_url, exc))
                    continue
                print(new_title + " 下载完成!")
    
    def main():
        '''
        Queue listing pages 1-100 and start five producers and five consumers.
        '''
        page_template = "https://fabiaoqing.com/biaoqing/lists/page/{}.html"
        worker_count = 5

        url_queue = Queue(100)
        img_queue = Queue(500)

        # One URL per listing page, numbered 1 through 100.
        for page_no in range(1, 101):
            url_queue.put(page_template.format(page_no))

        # Start all producers first, then all consumers.
        for worker_cls in (Producer, Consumer):
            for _ in range(worker_count):
                worker = worker_cls(url_queue, img_queue)
                worker.start()
    
    # Start the crawler only when this file is executed as a script.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    关于Maya Viewport 2.0 API 开发的介绍视频
    春节大假
    Some tips about the life cycle of Maya thread pool
    Can I compile and run Dx11Shader for Maya 2015 on my side?
    How to get current deformed vertex positions in MoBu?
    想加入全球首届的 欧特克云加速计划吗?
    三本毕业(非科班),四次阿里巴巴面试,终拿 offer(大厂面经)
    mac、window版编辑器 webstorm 2016... 永久破解方法。
    node 搭载本地代理,处理web本地开发跨域问题
    js 一维数组,转成嵌套数组
  • 原文地址:https://www.cnblogs.com/st-st/p/10410593.html
Copyright © 2011-2022 走看看