https://www.cnblogs.com/alamZ/p/7414020.html (courseware content)
# _*_ coding: utf-8 _*_
'''
Created on 2018-07-17
@author: sss
function: crawl qiushibaike pages with multiple threads
'''
# threading support
import threading
# thread-safe FIFO queue
from queue import Queue
# HTML parsing
from lxml import etree
# JSON serialization
import json
import os
import time

import requests

CRAWL_EXIT = False
PARSE_EXIT = False


class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        # threading.Thread.__init__(self)  # equivalent way to call the parent initializer
        super(ThreadCrawl, self).__init__()
        # thread name
        self.threadName = threadName
        # queue of page numbers
        self.pageQueue = pageQueue
        # queue of fetched page HTML
        self.dataQueue = dataQueue
        # request headers
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    def run(self):
        print('Starting ' + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Take one page number, FIFO order.
                # The optional `block` parameter defaults to True:
                # 1. if the queue is empty and block is True, get() does not return
                #    but blocks until new data arrives;
                # 2. if the queue is empty and block is False, get() raises queue.Empty.
                page = self.pageQueue.get(False)
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)
                self.dataQueue.put(content)
            except Exception:
                # queue.Empty or a failed request: just try again
                pass
        print('Finished ' + self.threadName)


class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        # thread name
        self.threadName = threadName
        # queue of fetched page HTML
        self.dataQueue = dataQueue
        # file object holding the parsed output
        self.filename = filename
        # lock shared by all parse threads
        self.lock = lock

    def run(self):
        print('Starting ' + self.threadName)
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except Exception:
                # queue.Empty: just try again
                pass
        print('Exiting ' + self.threadName)

    def parse(self, html):
        # parse into an HTML DOM
        html = etree.HTML(html)
        node_list = html.xpath('//div[contains(@id, "qiushi_tag_")]')
        for node in node_list:
            # username (replace() strips the whitespace)
            # username = node.xpath('./div/a/h2')[0].text
            username = node.xpath('.//h2')[0].text.replace(' ', '')
            # image links
            image = node.xpath('.//img/@src')  # [0]
            # post content under the title
            content = node.xpath('./a/div/span')[0].text.replace(' ', '')
            # upvote count
            zan = node.xpath('./div/span/i')[0].text
            # comment count
            comment = node.xpath('./div/span/a/i')[0].text
            items = {
                'username': username,
                'image': image,
                'content': content,
                'zan': zan,
                'comments': comment
            }
            # `with` always runs two operations: __enter__ and __exit__,
            # so no matter what happens inside the block, the lock is
            # acquired, the content is written, and the lock is released
            with self.lock:
                # write the parsed data, one JSON object per line
                self.filename.write(json.dumps(items, ensure_ascii=False) + "\n")
            print('written')


def main():
    # page-number queue with room for 20 pages
    pageQueue = Queue(20)
    # enqueue page numbers 1-4, first in first out
    for i in range(1, 5):
        pageQueue.put(i)

    # queue for the crawl results (raw HTML of each page);
    # no size argument means unbounded
    dataQueue = Queue()

    # open the output file (the directory must exist first)
    os.makedirs('./qiushi', exist_ok=True)
    filename = open('./qiushi/duanzi.json', 'a', encoding='utf-8')

    # create the lock
    lock = threading.Lock()

    # names for the three crawl threads
    crawlList = ['crawl thread 1', 'crawl thread 2', 'crawl thread 3']
    # list holding the three crawl threads
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # names for the three parse threads
    parseList = ['parse thread 1', 'parse thread 2', 'parse thread 3']
    # list holding the three parse threads
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    # wait until pageQueue is empty, i.e. every page number has been taken
    while not pageQueue.empty():
        pass

    # once pageQueue is empty, let the crawl threads leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print('pageQueue is empty')

    for thread in threadcrawl:
        thread.join()
    print(1)

    # wait until dataQueue is empty, i.e. every fetched page has been parsed
    while not dataQueue.empty():
        pass

    global PARSE_EXIT
    PARSE_EXIT = True

    for thread in threadparse:
        thread.join()
    print('2')

    with lock:
        # close the output file
        filename.close()
    print('Thanks for using!')


if __name__ == "__main__":
    main()
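The global exit flags plus `while not queue.empty(): pass` busy-wait loops above work, but they spin the CPU while waiting and force the `block=False` / exception dance inside each worker. Below is a minimal alternative sketch (not from the original courseware): the same producer/consumer shutdown expressed with blocking get() calls and one sentinel value per worker. The fetch and parse bodies here are placeholders, meant to be swapped for the requests/lxml logic in the script above.

import threading
from queue import Queue

SENTINEL = None  # one copy is queued per worker to signal "no more work"

def crawler(page_queue, data_queue):
    while True:
        page = page_queue.get()  # blocks instead of spinning on empty()
        if page is SENTINEL:
            break
        # placeholder for requests.get(...).text from the script above
        data_queue.put("<html for page %d>" % page)

def parser(data_queue, results, lock):
    while True:
        html = data_queue.get()
        if html is SENTINEL:
            break
        with lock:
            # placeholder for the lxml parsing and JSON writing above
            results.append(html)

def main():
    page_queue, data_queue = Queue(), Queue()
    results, lock = [], threading.Lock()
    for page in range(1, 5):
        page_queue.put(page)

    crawl_threads = [threading.Thread(target=crawler, args=(page_queue, data_queue))
                     for _ in range(3)]
    parse_threads = [threading.Thread(target=parser, args=(data_queue, results, lock))
                     for _ in range(3)]
    for t in crawl_threads + parse_threads:
        t.start()

    for _ in crawl_threads:      # stop the crawlers: one sentinel each
        page_queue.put(SENTINEL)
    for t in crawl_threads:
        t.join()
    for _ in parse_threads:      # crawlers are done, now stop the parsers
        data_queue.put(SENTINEL)
    for t in parse_threads:
        t.join()
    print('parsed %d pages' % len(results))

if __name__ == "__main__":
    main()

Because get() blocks, no thread burns CPU while idle, and because each worker consumes exactly one sentinel before exiting, every queued page is guaranteed to be fetched and parsed before the threads shut down.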