"""Multi-threaded downloader for the images listed on doutula.com.

``Produce`` threads fetch listing pages and queue ``(image_url, file_path)``
pairs; ``Consumer`` threads download each queued image to disk.  ``main``
wires the two together and shuts the consumers down with a stop marker once
every producer has finished.
"""
import logging
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree

logger = logging.getLogger(__name__)

# Directory the images are written to.
SAVE_DIR = r'G:\picktrue'

# Seconds to wait for a listing page before giving up on it.
REQUEST_TIMEOUT = 10

# Punctuation stripped from image titles before they are used as file names:
# the ASCII / full-width marks the script always removed, plus the characters
# Windows forbids in file names.
_UNSAFE_NAME_CHARS = re.compile(r'[??.。,,!!\\/:*"<>|]')

# Item placed on the image queue to tell one consumer to stop.
_STOP = None


class Produce(threading.Thread):
    """Worker that turns listing-page URLs into image download jobs.

    Args:
        page_queue: Queue of listing-page URLs to fetch.
        image_queue: Queue that receives ``(image_url, file_path)`` tuples.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Produce, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        """Parse pages until the page queue is drained."""
        while True:
            # get_nowait() avoids the empty()/get() race: with several
            # producers, another thread may take the last URL between the
            # check and a blocking get(), which would hang this thread.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one listing page and queue every image found on it.

        Entries lacking a ``data-original`` URL or an ``alt`` title are
        skipped.  A page that cannot be fetched is logged and ignored so the
        thread can carry on with the remaining pages.
        """
        try:
            resp = requests.get(url, headers=self.headers,
                                timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
        except requests.RequestException as exc:
            logger.warning('Failed to fetch %s: %s', url, exc)
            return

        html = etree.HTML(resp.text)
        if html is None:
            # lxml returns None when the document is empty.
            return

        for pic in html.xpath('//div[@class="col-xs-6 col-sm-3"]'):
            urls = pic.xpath('.//img//@data-original')
            names = pic.xpath('.//img//@alt')
            if not urls or not names:
                continue
            pic_url = urls[0]
            pic_name = _UNSAFE_NAME_CHARS.sub('', names[0])
            extension = os.path.splitext(pic_url)[1]
            filename = os.path.join(SAVE_DIR, pic_name + extension)
            self.image_queue.put((pic_url, filename))


class Consumer(threading.Thread):
    """Worker that downloads the images queued by ``Produce`` threads.

    The thread runs until it takes a ``None`` item from ``image_queue``.

    Args:
        page_queue: Queue of listing-page URLs; stored but not consulted,
            kept so the constructor signature stays unchanged.
        image_queue: Queue of ``(image_url, file_path)`` tuples, terminated
            by one ``None`` per consumer.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        """Download queued images until the stop marker is received."""
        while True:
            item = self.image_queue.get()
            if item is _STOP:
                break
            pic_url, filename = item
            try:
                request.urlretrieve(pic_url, filename)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError is
                # raised for malformed URLs.  One bad image must not end
                # the worker.
                logger.warning('Failed to download %s: %s', pic_url, exc)


def main():
    """Download the images of listing pages 1 and 2 using 3 + 3 threads."""
    page_queue = Queue(20)
    image_queue = Queue(1000)

    os.makedirs(SAVE_DIR, exist_ok=True)

    for page in range(1, 3):
        page_queue.put('http://www.doutula.com/article/list/?page=%d' % page)

    producers = [Produce(page_queue, image_queue) for _ in range(3)]
    consumers = [Consumer(page_queue, image_queue) for _ in range(3)]
    for worker in producers + consumers:
        worker.start()

    # Only after every producer has finished is it certain that no further
    # images will be queued; then release each consumer with a stop marker.
    for worker in producers:
        worker.join()
    for _ in consumers:
        image_queue.put(_STOP)
    for worker in consumers:
        worker.join()


if __name__ == '__main__':
    main()