zoukankan      html  css  js  c++  java
  • 第二十二节 多线程爬取表情包

     1 import requests
     2 from lxml import etree
     3 from urllib import request
     4 import re
     5 import os
     6 from queue import Queue
     7 import threading
     8 
     9 
    10 '''
    11 这个程序有bug
    12 '''
    class Produce(threading.Thread):
        """Producer thread that turns listing-page URLs into download jobs.

        Each instance takes page URLs from ``page_queue``, fetches and parses
        the page, and puts one ``(image_url, destination_path)`` tuple per
        image onto ``image_queue``.
        """

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        # Directory under which destination paths are built.
        # NOTE(review): the published listing showed r'G:picktrue' joined with a
        # broken '\' literal; the backslashes look lost in transcription, so the
        # Windows path is restored here -- confirm the intended directory.
        save_dir = r'G:\picktrue'
        # Seconds to wait for a listing page before giving up on it.
        request_timeout = 10
        # Characters stripped from image titles: the punctuation the original
        # filter removed (ASCII and full-width forms) plus characters that are
        # not allowed in Windows file names.
        _unsafe_chars = re.compile(r'[??.。,,!!\\/:*"<>|]')

        def __init__(self, page_queue, image_queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: queue of listing-page URLs to consume.
                image_queue: queue that receives ``(image_url, path)`` tuples.
            """
            super(Produce, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.image_queue = image_queue

        def run(self):
            """Process page URLs until ``page_queue`` is drained."""
            from queue import Empty  # local import: the file only imports Queue

            while True:
                # get_nowait() replaces the "empty() then get()" pair: with several
                # producers another thread can take the last URL between the two
                # calls, which would leave this thread blocked in get() forever.
                try:
                    url = self.page_queue.get_nowait()
                except Empty:
                    break
                self.parse_page(url)

        def parse_page(self, url):
            """Fetch one listing page and enqueue a download job per image.

            A page that cannot be fetched is reported and skipped so the thread
            keeps working on the remaining pages.
            """
            try:
                resp = requests.get(
                    url, headers=self.headers, timeout=self.request_timeout)
            except requests.RequestException as exc:
                print('failed to fetch %s: %s' % (url, exc))
                return
            html = etree.HTML(resp.text)
            if html is None:
                # Defensive: nothing to search if no document tree was produced.
                return
            for pic in html.xpath('//div[@class="col-xs-6 col-sm-3"]'):
                urls = pic.xpath('.//img//@data-original')
                if not urls:
                    # Tiles without a lazy-load URL would otherwise raise
                    # IndexError and kill the whole thread.
                    continue
                names = pic.xpath('.//img//@alt')
                pic_url = urls[0]
                pic_name = names[0] if names else ''
                self.image_queue.put(
                    (pic_url, self._build_filename(pic_url, pic_name)))

        def _build_filename(self, pic_url, pic_name):
            """Return the destination path for an image.

            The file name is ``pic_name`` with unsafe characters removed (falling
            back to the URL's own base name when nothing is left), followed by
            the extension taken from ``pic_url`` with any query string ignored.
            """
            url_path = pic_url.split('?', 1)[0]
            stem, extension = os.path.splitext(os.path.basename(url_path))
            clean_name = self._unsafe_chars.sub('', pic_name).strip() or stem
            return os.path.join(self.save_dir, clean_name + extension)
    40 
    41 
    class Consumer(threading.Thread):
        """Consumer thread that downloads the images queued by the producers.

        Items on ``image_queue`` are ``(image_url, destination_path)`` tuples.
        """

        # Seconds to wait for a new job before checking whether to stop.
        idle_timeout = 3

        def __init__(self, page_queue, image_queue, *args, **kwargs):
            """Store the shared queues; extra arguments go to ``threading.Thread``.

            Args:
                page_queue: queue of listing-page URLs (only inspected, never
                    consumed, by this thread).
                image_queue: queue of ``(image_url, path)`` download jobs.
            """
            super(Consumer, self).__init__(*args, **kwargs)
            self.page_queue = page_queue
            self.image_queue = image_queue

        def run(self):
            """Download queued images until no more work is expected.

            The thread stops once ``image_queue`` has stayed empty for
            ``idle_timeout`` seconds while ``page_queue`` is empty as well.
            This is a heuristic: a page that was already taken off
            ``page_queue`` but needs longer than ``idle_timeout`` to be parsed
            can still be missed.
            """
            from queue import Empty  # local import: the file only imports Queue

            while True:
                # A timed get() replaces "empty() then get()": that pair could
                # exit before any job was queued, or block forever when a sibling
                # consumer took the last job between the check and the get().
                try:
                    pic_url, filmname = self.image_queue.get(
                        timeout=self.idle_timeout)
                except Empty:
                    if self.page_queue.empty():
                        break
                    continue
                try:
                    target_dir = os.path.dirname(filmname)
                    if target_dir:
                        os.makedirs(target_dir, exist_ok=True)
                    request.urlretrieve(pic_url, filmname)
                except (OSError, ValueError) as exc:
                    # URL and file-system errors are OSError subclasses;
                    # ValueError covers malformed URLs. One bad image must not
                    # end the thread.
                    print('failed to download %s: %s' % (pic_url, exc))
    54 
    55 
    def main():
        """Queue listing pages 1 and 2, then start the worker threads.

        Three ``Produce`` threads are started first, followed by three
        ``Consumer`` threads; all of them share the same two queues.
        """
        page_queue = Queue(20)
        image_queue = Queue(1000)

        page_number = 1
        while page_number < 3:
            page_queue.put(
                'http://www.doutula.com/article/list/?page=%d' % page_number)
            page_number += 1

        # Producers are started before consumers, three of each.
        for worker_cls in (Produce, Consumer):
            for _ in range(3):
                worker_cls(page_queue, image_queue).start()


    # Run the crawler only when the file is executed as a script.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    Head first javascript(七)
    Python Fundamental for Django
    Head first javascript(六)
    Head first javascript(五)
    Head first javascript(四)
    Head first javascript(三)
    Head first javascript(二)
    Head first javascript(一)
    Sicily 1090. Highways 解题报告
    Python GUI programming(tkinter)
  • 原文地址:https://www.cnblogs.com/kogmaw/p/12507064.html
Copyright © 2011-2022 走看看