import threading
import time
from queue import Queue

import requests
from lxml import etree


class QiuBaiSpider(object):
    """Multi-threaded scraper for the qiushibaike.com "hot" pages.

    Pipeline: url_queue -> send_request -> response_queue
              -> analysis_data -> data_queue -> write_file.
    All workers run as daemon threads; run() returns once every queue
    has been fully drained (each item marked task_done()).
    """

    # 1. Target site template and request headers.
    def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}
        self.data = 0                  # number of titles extracted
        self.url_queue = Queue()       # URLs waiting to be fetched
        self.response_queue = Queue()  # fetched HTTP responses
        self.data_queue = Queue()      # extracted title strings
        self.count = 0                 # number of pages parsed
        self._lock = threading.Lock()  # protects the two counters above

    # 2. Enqueue the URLs to crawl (pages 1..12).
    def get_url_list(self):
        for page in range(1, 13):
            self.url_queue.put(self.base_url.format(page))

    # 3. Worker: fetch each URL and push the response downstream.
    def send_request(self):
        while True:
            url = self.url_queue.get()
            try:
                # A timeout keeps a hung server from stalling this worker forever.
                response = requests.get(url, headers=self.headers, timeout=10)
                self.response_queue.put(response)
            except requests.RequestException:
                # Best-effort crawl: a page that fails to download is skipped.
                pass
            finally:
                # Must run even on failure, or url_queue.join() deadlocks.
                self.url_queue.task_done()

    # 4. Worker: parse each response and extract the item titles.
    def analysis_data(self):
        while True:
            response = self.response_queue.get()
            try:
                html_data = etree.HTML(response.content)
                with self._lock:
                    self.count += 1
                div_list = html_data.xpath('//*[@id="content-left"]/div')
                for div in div_list:
                    titles = div.xpath('.//h2/text()')
                    if not titles:
                        # Skip items without an <h2> instead of raising IndexError.
                        continue
                    with self._lock:
                        self.data += 1
                    self.data_queue.put(titles[0])
            except Exception:
                # A malformed page must not kill the worker thread; without it
                # response_queue.join() would hang on the lost task_done().
                pass
            finally:
                self.response_queue.task_done()

    # 5. Worker: drain the extracted data.
    # NOTE(review): despite the name, the original never persisted the items
    # anywhere; this preserves that behavior and only consumes the queue.
    def write_file(self):
        while True:
            self.data_queue.get()
            self.data_queue.task_done()

    def _start(self):
        """Start all worker threads as daemons, then block until the queues drain."""
        th_list = [threading.Thread(target=self.get_url_list)]
        # Two download workers so network waits overlap.
        for _ in range(2):
            th_list.append(threading.Thread(target=self.send_request))
        th_list.append(threading.Thread(target=self.analysis_data))
        th_list.append(threading.Thread(target=self.write_file))
        print(th_list)
        for th in th_list:
            # setDaemon() is deprecated (removed in 3.13); set the attribute.
            th.daemon = True
            th.start()
        # join() returns once every put() item has been marked task_done().
        for q in (self.url_queue, self.response_queue, self.data_queue):
            q.join()

    def run(self):
        """Entry point: run the crawl, then report elapsed time and item count."""
        start = time.time()
        self._start()
        end = time.time()
        print(end - start, "结束时间")
        print(self.data)


if __name__ == '__main__':
    qiu_bai = QiuBaiSpider()
    qiu_bai.run()