zoukankan      html  css  js  c++  java
  • 【爬虫】多线程爬取表情包

    '''
    Download meme images ("biaoqingbao") using multiple threads and queues.
    Listing pages crawled: https://fabiaoqing.com/biaoqing/lists/page/{N}.html
    '''
    
    import os
    import re
    import threading
    from queue import Empty, Queue
    from urllib import request

    import requests
    from lxml import etree
    
    
    class Producer(threading.Thread):
        """Fetch listing pages and queue ``(file_name, image_url)`` pairs.

        Each producer pulls page URLs from ``url_queue`` until it is empty,
        extracts every image found inside ``div.tagbqppdiv`` and pushes one
        ``(file_name, image_url)`` tuple per image onto ``img_queue``.
        """

        # Punctuation removed from titles before they are used as file names.
        # Includes path separators and other characters that are unsafe in
        # file names, because the titles come from an untrusted web page.
        _TITLE_JUNK = re.compile(r'[-+*.?。,!?、/()“”">::\\<|]+')

        _HEADERS = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
        }

        # Seconds to wait for a page before giving up, so that a stalled
        # server cannot hang the thread forever.
        _TIMEOUT = 10

        def __init__(self, url_queue, img_queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``."""
            super().__init__(*args, **kwargs)
            self.url_queue = url_queue
            self.img_queue = img_queue

        def run(self):
            """Process page URLs until ``url_queue`` is drained."""
            while True:
                # get_nowait() instead of empty()+get(): another producer may
                # take the last URL between the check and the blocking get().
                try:
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        @classmethod
        def _build_filename(cls, title, img_url):
            """Return a file name for the image, or ``None`` if it has no URL.

            The name is the cleaned title plus the extension taken from
            ``img_url``. When the title is missing or consists only of stripped
            characters, the last path component of the URL is used instead.
            """
            if not img_url:
                return None
            root, ext = os.path.splitext(img_url)
            stem = cls._TITLE_JUNK.sub('', title or '')
            if not stem:
                stem = os.path.basename(root)
            return stem + ext

        def parse_page(self, url):
            """Download one listing page and enqueue every image found on it.

            Network failures are reported and skipped so that a single bad page
            does not terminate the producer thread.
            """
            try:
                response = requests.get(url, headers=self._HEADERS, timeout=self._TIMEOUT)
            except requests.RequestException as exc:
                print("{} 请求失败: {}".format(url, exc))
                return
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            if html is None:
                # Defensive: nothing parseable came back for this page.
                return
            for img in html.xpath('//div[@class="tagbqppdiv"]//img'):
                img_url = img.get('data-original')
                new_title = self._build_filename(img.get('title'), img_url)
                if new_title is None:
                    # <img> without a data-original attribute: nothing to download.
                    continue
                # Hand the file name and image URL over to the consumers.
                self.img_queue.put((new_title, img_url))
    
    class Consumer(threading.Thread):
        """Download queued images into the local ``./image/`` directory.

        Each consumer takes ``(file_name, image_url)`` tuples from
        ``img_queue`` and saves the image under ``./image/<file_name>``.
        """

        _IMAGE_DIR = "./image/"

        # Seconds to wait for a new image before checking whether the crawl is
        # finished. Exiting is a heuristic: the consumer stops once no image
        # arrived for this long AND no page URLs are left to be parsed.
        _IDLE_TIMEOUT = 5

        def __init__(self, url_queue, img_queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``."""
            super().__init__(*args, **kwargs)
            self.url_queue = url_queue
            self.img_queue = img_queue

        def run(self):
            """Download images until both queues have run dry."""
            # urlretrieve() does not create missing directories.
            os.makedirs(self._IMAGE_DIR, exist_ok=True)
            while True:
                # A timed get() replaces empty()+get(): the unconditional
                # blocking get() could wait forever if another consumer took
                # the last item after the emptiness check.
                try:
                    new_title, img_url = self.img_queue.get(timeout=self._IDLE_TIMEOUT)
                except Empty:
                    if self.url_queue.empty():
                        break
                    continue

                # Download the image; a single failure must not kill the worker.
                try:
                    request.urlretrieve(img_url, self._IMAGE_DIR + new_title)
                except (OSError, ValueError) as exc:
                    # OSError covers URLError/HTTPError and file-system errors;
                    # ValueError is raised for malformed URLs.
                    print("{} 下载失败: {}".format(new_title, exc))
                    continue
                print(new_title + " 下载完成!")
    
    def main():
        """Fill the URL queue with 100 listing pages and start the worker threads.

        Five producers are started first, followed by five consumers; all of
        them share the same pair of bounded queues.
        """
        page_template = "https://fabiaoqing.com/biaoqing/lists/page/{}.html"

        url_queue = Queue(100)
        img_queue = Queue(500)

        # Pages are numbered 1..100 inclusive.
        for page_no in range(1, 101):
            url_queue.put(page_template.format(page_no))

        # Producers first, then consumers, five of each.
        for worker_cls in (Producer, Consumer):
            for _ in range(5):
                worker = worker_cls(url_queue, img_queue)
                worker.start()
    
    # Start the crawl only when this file is executed as a script,
    # not when it is imported as a module.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    JS调试debug
    避免使用 JS 特性 with(obj){}
    bit Byte KB MB GB TB 单位换算
    C语言中连接器介绍
    [bzoj3600]没有人的算术
    [bzoj4373]算术天才⑨与等差数列
    [bzoj4151][AMPPZ2014]The Cave
    [bzoj4906][BeiJing2017]喷式水战改
    [bzoj4908][BeiJing2017]开车
    [Codeforces Round#417 Div.2]
  • 原文地址:https://www.cnblogs.com/st-st/p/10410593.html
Copyright © 2011-2022 走看看