zoukankan      html  css  js  c++  java
  • 爬虫_斗图啦(队列,多线程)

import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree
     8 
     9 
    10 class Producer(threading.Thread):
    11     headers = {
    12         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    13     }
    14     def __init__(self,page_queue,img_queue,*args,**kwargs):
    15         super(Producer, self).__init__(*args,**kwargs)
    16         self.page_queue = page_queue
    17         self.img_queue = img_queue
    18 
    19 
    20     def run(self):
    21         while True:
    22             if self.page_queue.empty():
    23                 break
    24             url = self.page_queue.get()
    25             self.parse_page(url)
    26 
    27 
    28     def parse_page(self,url):
    29         response = requests.get(url,headers=self.headers)
    30         text = response.text
    31         html = etree.HTML(text)
    32         imgs = html.xpath("//div[@class='page-content text-center']//a//img")
    33         for img in imgs:
    34             if img.get('class') == 'gif':
    35                 continue
    36             img_url = img.xpath(".//@data-original")[0]
    37             suffix = os.path.splitext(img_url)[1]
    38             alt = img.xpath(".//@alt")[0]
    39             alt = re.sub(r'[,。??,/\·]','',alt)
    40             img_name = alt + suffix
    41             self.img_queue.put((img_url,img_name))
    42 
    43 
    44 class Consumer(threading.Thread):
    45     def __init__(self,page_queue,img_queue,*args,**kwargs):
    46         super(Consumer, self).__init__(*args,**kwargs)
    47         self.page_queue = page_queue
    48         self.img_queue = img_queue
    49 
    50 
    51     def run(self):
    52         while True:
    53             if self.img_queue.empty():
    54                 if self.page_queue.empty():
    55                     return
    56             img = self.img_queue.get(block=True)
    57             url,filename = img
    58             request.urlretrieve(url,'images/'+filename)
    59             print(filename+'  下载')
    60 
    61 
    62 def main():
    63     page_queue = Queue(100)
    64     img_queue = Queue(500)
    65 
    66     for x in range(1,101):
    67         url = "http://www.doutula.com/photo/list/?page=%d" % x
    68         page_queue.put(url)
    69     for x in range(5):
    70         t = Producer(page_queue,img_queue)
    71         t.start()
    72     for x in range(5):
    73         t = Consumer(page_queue,img_queue)
    74         t.start()
    75 
    76 
    77 if __name__ == '__main__':
    78     main()

    实测下载速度相当快(5 个生产者线程解析页面,5 个消费者线程并发下载图片)。

  • 相关阅读:
    P1891 疯狂LCM
    P2568 GCD
    P1516 青蛙的约会和P2421 [NOI2002]荒岛野人
    P4168 蒲公英
    P5960 差分约束算法模板
    P2024 食物链(种类并查集)
    CF1328E Tree Queries
    CF1328B K-th Beautiful String
    dij-spfa乱搞
    P1993 小K的农场
  • 原文地址:https://www.cnblogs.com/MC-Curry/p/9459640.html
Copyright © 2011-2022 走看看