zoukankan      html  css  js  c++  java
  • 爬虫之多线程、异步

    1.使用传统方式爬取“斗图啦”网站的图片

    # -*- coding: utf-8 -*-
    import requests
    from lxml import etree
    import re
    import os.path
    from urllib import request
    
    def parse_page(url):
        """Download every non-GIF image found on one list page.

        Fetches ``url``, selects the ``<img>`` elements nested in links inside
        the "page-content text-center" container whose class is not "gif", and
        saves each image into the save directory, named after its ``alt`` text
        plus the extension taken from the image URL.

        Images without a ``data-original`` attribute are skipped, and an image
        whose cleaned ``alt`` text is empty is named after the URL's base name.

        Args:
            url: Address of the list page to scrape.
        """
        headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        response = requests.get(url=url, headers=headers)
        parser = etree.HTMLParser(encoding='utf-8')
        html = etree.fromstring(response.text, parser=parser)
        images = html.xpath('//div[@class="page-content text-center"]//a//img[@class!="gif"]')
        for img in images:
            img_url = img.get("data-original")
            if not img_url:
                # Element.get() returns None for a missing attribute; without a
                # URL there is nothing to download.
                continue
            # Strip punctuation from the alt text before using it as a file name.
            img_name = re.sub(r'[??!!./,,。]', '', img.get("alt") or '')
            if not img_name:
                # Avoid saving several images under the same bare extension.
                img_name = os.path.splitext(os.path.basename(img_url))[0]
            img_postfix = os.path.splitext(img_url)[1]
            # Absolute directory: the drive-relative form 'E:study' would depend
            # on the current directory of drive E.
            img_save_path = os.path.join(r'E:\study', img_name + img_postfix)
            request.urlretrieve(img_url, img_save_path)
    
    def main():
        """Scrape list pages 1 through 100, one page after another."""
        url_template = 'https://www.doutula.com/photo/list/?page=%d'
        page_urls = (url_template % number for number in range(1, 101))
        for page_url in page_urls:
            parse_page(page_url)
           
    
    # Start the sequential crawl only when this file is run as a script,
    # not when it is imported as a module.
    if __name__=='__main__':
        main()
    View Code

     2.使用生产者与消费者模式多线程下载表情包

    # -*- coding: utf-8 -*-
    import requests
    from lxml import etree
    import re
    import os.path
    from urllib import request
    import threading
    from queue import Queue
    
    class Producer(threading.Thread):
        """Worker thread that turns list-page URLs into image download jobs.

        Each instance repeatedly takes a page URL from ``page_queue``, parses
        that page, and puts one ``(img_url, img_save_path)`` tuple per image on
        ``image_queue``. The thread ends once ``page_queue`` has no more URLs.
        """
        # Request headers sent with every page fetch.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

        def __init__(self, page_queue, image_queue, *args, **kwargs):
            """Store the two queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: Queue of list-page URLs to consume.
                image_queue: Queue receiving ``(img_url, img_save_path)`` tuples.
            """
            super(Producer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.image_queue = image_queue

        def run(self):
            """Parse pages until ``page_queue`` is exhausted."""
            # Imported here because the file only imports ``Queue`` from queue.
            from queue import Empty

            while True:
                try:
                    # A non-blocking get cannot hang when another producer
                    # takes the last URL between an empty() check and get().
                    url = self.page_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Fetch one list page and enqueue a download job per non-GIF image.

            Images without a ``data-original`` attribute are skipped, and an
            image whose cleaned ``alt`` text is empty is named after the URL's
            base name.

            Args:
                url: Address of the list page to scrape.
            """
            response = requests.get(url=url, headers=self.headers)
            parser = etree.HTMLParser(encoding='utf-8')
            html = etree.fromstring(response.text, parser=parser)
            images = html.xpath('//div[@class="page-content text-center"]//a//img[@class!="gif"]')
            for img in images:
                img_url = img.get("data-original")
                if not img_url:
                    # Element.get() returns None for a missing attribute.
                    continue
                # Strip punctuation from the alt text before using it as a name.
                img_name = re.sub(r'[,。?!*,\.?/]', '', img.get("alt") or '')
                if not img_name:
                    # Avoid several images sharing the same bare extension.
                    img_name = os.path.splitext(os.path.basename(img_url))[0]
                img_postfix = os.path.splitext(img_url)[1]
                # Absolute directory: the drive-relative form 'E:study' would
                # depend on the current directory of drive E.
                img_save_path = os.path.join(r'E:\study', img_name + img_postfix)
                self.image_queue.put((img_url, img_save_path))
    class Consumer(threading.Thread):
        """Worker thread that downloads the images queued by the producers.

        Each instance repeatedly takes an ``(url, path)`` tuple from
        ``image_queue`` and saves the URL's content to ``path``.
        """
        # Seconds to wait for a job before checking whether the page queue has
        # drained. An empty page queue does not prove that producers have
        # finished parsing, so this wait acts as a grace period for late jobs.
        idle_timeout = 3

        def __init__(self, page_queue, image_queue, *args, **kwargs):
            """Store the two queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: Queue of list-page URLs still waiting for a producer.
                image_queue: Queue of ``(url, path)`` download jobs.
            """
            super(Consumer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.image_queue = image_queue

        def run(self):
            """Download queued images until no work is left.

            The thread returns when no job arrived within ``idle_timeout``
            seconds and ``page_queue`` is empty. A failed download is reported
            and skipped so one bad image does not stop the worker.
            """
            # Imported here because the file only imports ``Queue`` from queue.
            from queue import Empty

            while True:
                try:
                    # A timed get cannot block forever when another consumer
                    # takes the last job between an empty() check and get().
                    url, path = self.image_queue.get(timeout=self.idle_timeout)
                except Empty:
                    if self.page_queue.empty():
                        return
                    continue
                try:
                    request.urlretrieve(url, path)
                except (OSError, ValueError) as exc:
                    # URLError/HTTPError are OSError subclasses; ValueError is
                    # raised for a URL without a usable scheme.
                    print(path + "下载失败: %s" % exc)
                    continue
                print(path + "下载完成!")
    
    
    def main():
        """Queue list pages 1-100, then start five producers and five consumers."""
        page_queue = Queue(100)
        img_queue = Queue(1000)

        url_template = 'https://www.doutula.com/photo/list/?page=%d'
        for page_no in range(1, 101):
            page_queue.put(url_template % page_no)

        # Producers are all started before any consumer is created.
        producers = [Producer(page_queue, img_queue) for _ in range(5)]
        for worker in producers:
            worker.start()

        consumers = [Consumer(page_queue, img_queue) for _ in range(5)]
        for worker in consumers:
            worker.start()
    
    # Launch the producer/consumer image crawl only when this file is run as
    # a script, not when it is imported as a module.
    if __name__=='__main__':
        main()
    View Code

     3.使用生产者消费者模式下载内涵段子并保存在csv文件

    # -*- coding: utf-8 -*-
    import requests
    from lxml import etree
    import threading
    from queue import Queue
    import csv
    
    class Producer(threading.Thread):
        """Worker thread that scrapes joke pages into ``(text, link)`` tuples.

        Each instance repeatedly takes a page URL from ``page_queue``, parses
        that page, and puts one ``(joke_content, link)`` tuple per joke on
        ``joke_queue``. The thread ends once ``page_queue`` has no more URLs.
        """
        # Request headers sent with every page fetch.
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

        def __init__(self, page_queue, joke_queue, *args, **kwargs):
            """Store the two queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: Queue of page URLs to consume.
                joke_queue: Queue receiving ``(joke_content, link)`` tuples.
            """
            super(Producer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.joke_queue = joke_queue

        def run(self):
            """Parse pages until ``page_queue`` is exhausted."""
            # Imported here because the file only imports ``Queue`` from queue.
            from queue import Empty

            while True:
                try:
                    # A non-blocking get cannot hang when another producer
                    # takes the last URL between an empty() check and get().
                    url = self.page_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Fetch one page and enqueue every joke found on it.

            For each link inside a ``j-r-list-c-desc`` container, the link's
            text nodes are joined with newlines and stripped, and its ``href``
            is appended to the site root to form the joke's address. Links
            without an ``href`` are skipped.

            Args:
                url: Address of the page to scrape; its last path segment is
                    printed as the page number once the page is done.
            """
            response = requests.get(url=url, headers=self.headers)
            parser = etree.HTMLParser(encoding='utf-8')
            html = etree.fromstring(response.text, parser=parser)
            descs = html.xpath("//div[@class='j-r-list-c-desc']//a")
            for a in descs:
                joke_content = "\n".join(a.xpath("text()")).strip()
                hrefs = a.xpath("@href")
                if not hrefs:
                    # No target address to record for this anchor.
                    continue
                link = 'http://www.budejie.com' + hrefs[0]
                self.joke_queue.put((joke_content, link))
            print('=' * 30 + "第%s页下载完成!" % url.split('/')[-1] + "=" * 30)
    
    class Consumer(threading.Thread):
        """Worker thread that appends queued jokes to a CSV file.

        Each instance repeatedly takes a ``(joke, link)`` tuple from
        ``joke_queue`` and appends it as one row to ``csvfilepath``.
        """
        # One lock shared by every Consumer: a lock created per instance would
        # never be contended, so it could not serialize writes to the CSV file.
        lock = threading.Lock()
        # Seconds to wait for a row before checking whether the page queue has
        # drained. An empty page queue does not prove that producers have
        # finished parsing, so this wait acts as a grace period for late rows.
        idle_timeout = 3

        def __init__(self, page_queue, joke_queue, csvfilepath, *args, **kwargs):
            """Store the queues and output path; extras go to ``threading.Thread``.

            Args:
                page_queue: Queue of page URLs still waiting for a producer.
                joke_queue: Queue of ``(joke, link)`` rows to write.
                csvfilepath: Path of the CSV file to append to.
            """
            super(Consumer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.joke_queue = joke_queue
            self.csvfilepath = csvfilepath

        def run(self):
            """Write queued rows until no work is left.

            The thread returns when no row arrived within ``idle_timeout``
            seconds and ``page_queue`` is empty.
            """
            # Imported here because the file only imports ``Queue`` from queue.
            from queue import Empty

            while True:
                try:
                    # A timed get cannot block forever when another consumer
                    # takes the last row between an empty() check and get().
                    joke, link = self.joke_queue.get(timeout=self.idle_timeout)
                except Empty:
                    if self.page_queue.empty():
                        return
                    continue
                # The context manager releases the lock even if the write fails.
                with self.lock:
                    self.writecsv((joke, link))

        def writecsv(self, row):
            """Append ``row`` to the CSV file using the Excel dialect.

            Args:
                row: Sequence of cell values forming one CSV row.
            """
            with open(self.csvfilepath, "a+", newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile, dialect='excel')
                writer.writerow(row)
    
    def main():
        """Queue joke pages 1-10, then start five producers and five consumers.

        Producers scrape the queued pages into ``joke_queue``; consumers append
        the scraped rows to the CSV file at ``csvfilepath``.
        """
        page_queue = Queue(10)
        joke_queue = Queue(1000)
        # Absolute path with explicit separators: the drive-relative form
        # 'E:studyjoke.csv' would create a file named "studyjoke.csv" in
        # whatever the current directory of drive E happens to be.
        csvfilepath = r'E:\study\joke.csv'

        for page in range(1, 11):
            url = 'http://www.budejie.com/text/%d' % page
            page_queue.put(url)
        for _ in range(5):
            producer = Producer(page_queue, joke_queue)
            producer.start()
        for _ in range(5):
            consumer = Consumer(page_queue, joke_queue, csvfilepath)
            consumer.start()
    
    # Launch the producer/consumer joke crawl only when this file is run as
    # a script, not when it is imported as a module.
    if __name__=='__main__':
        main()
    View Code

    >>>>>>>>>>待续

  • 相关阅读:
    linux下动态链接库.so文件 静态链接库.a文件创建及使用
    matlab 自动阈值白平衡算法 程序可编译实现
    C++ 迭代器介绍 [转摘]
    C++ Primer 第三章 标准库类型vector+迭代器iterator 运算
    matlab灰度变彩色+白平衡算法实现
    我和奇葩的故事之失联第七天
    C++ Primer 第三章 标准库类型string运算
    OpenCV白平衡算法之灰度世界法(消除RGB受光照影响)
    查看网络情况netstat指令与动态监控top指令
    linux服务
  • 原文地址:https://www.cnblogs.com/wuxunyan/p/10648135.html
Copyright © 2011-2022 走看看