# News pagination URL: https://news.cnblogs.com/n/page/10/ — the last number in the URL is the page number.
# Crawl cnblogs news list pages with a small threaded pipeline:
#   create_urls -> urls queue -> crawler -> htmls queue -> parse -> outputs queue -> save
from concurrent.futures import ThreadPoolExecutor
import threading
import time
from queue import Queue, Empty
import logging

import requests
from bs4 import BeautifulSoup

# Log format: timestamp, thread name, thread id, message.
FORMAT = "%(asctime)s %(threadName)s %(thread)d %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)

# Shared shutdown flag: event.set() tells every worker loop to exit.
event = threading.Event()

# URL prefix and User-Agent header used for all requests.
base_url = 'https://news.cnblogs.com'
page_path = '/n/page/'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Work queues connecting the pipeline stages.
urls = Queue()     # URLs waiting to be fetched (crawled queue omitted)
htmls = Queue()    # raw page HTML waiting to be parsed (large, not persisted)
outputs = Queue()  # extracted (title, link) tuples waiting to be saved


# 1. Build the list-page URLs to crawl.
def create_urls(start, stop, step=1):
    """Put list-page URLs for pages start..stop (inclusive) onto the urls queue."""
    for page in range(start, stop + 1, step):
        urls.put('{}{}{}/'.format(base_url, page_path, page))
    print('url创建完毕')


# 2. Fetch each queued URL and hand the HTML to the parser stage.
def crawler():
    """Worker: GET queued URLs and push the response text onto htmls."""
    while not event.is_set():
        try:
            # 1-second timeout so the loop can notice a shutdown request.
            url = urls.get(True, 1)
        except Empty:
            continue  # nothing to fetch yet; poll the event again
        try:
            response = requests.get(url, headers={'User-agent': ua})
            with response:
                htmls.put(response.text)
            print('url:', url)
        except Exception as e:
            logging.error(e)


# 3. Extract the useful data from each fetched page.
def parse():
    """Worker: pull HTML from htmls and queue (title, link) pairs on outputs."""
    while not event.is_set():
        try:
            html = htmls.get(True, 1)
        except Empty:
            continue
        try:
            soup = BeautifulSoup(html, 'lxml')
            # Each news entry is an <a> inside <h2 class="news_entry">.
            for anchor in soup.select('h2.news_entry a'):
                title = anchor.text
                ref = base_url + anchor.attrs.get('href')
                print('get_title:', title, 'get_ref:', ref)
                outputs.put((title, ref))
        except Exception as e:
            logging.error(e)


# 4. Persist the extracted records to a file.
def save(path):
    """Worker: append 'title_link' records from outputs to the file at path."""
    with open(path, 'a+', encoding='utf-8') as f:
        while not event.is_set():
            try:
                title, ref = outputs.get(True, 1)  # tuple (title, ref)
            except Empty:
                continue
            print('save_title:', title, 'save_ref:', ref)
            f.write('{}_{}\n'.format(title, ref))  # one record per line
            f.flush()  # make progress durable immediately


# Thread pool shared by all pipeline stages (max 10 threads).
executor = ThreadPoolExecutor(max_workers=10)


def main():
    """Launch the pipeline and wait for 'q' on stdin to shut everything down."""
    executor.submit(create_urls, 1, 10)  # seed URLs; parse could enqueue more later
    executor.submit(parse)
    executor.submit(save, 'news.txt')
    for _ in range(7):
        executor.submit(crawler)
    while True:
        cmd = input('>>>')
        if cmd.strip() == 'q':
            # Signal every worker loop to stop; they notice within ~1 second.
            event.set()
            executor.shutdown()
            print('closing')
            time.sleep(1)
            break


if __name__ == '__main__':
    main()