  • Multithreaded scraping of news titles and links

    News list pagination URL: https://news.cnblogs.com/n/page/10/; the last number in the URL is the page number.
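    A quick sanity check of that pattern (the page numbers 1, 2 and 10 below are just illustrative):

    # Illustrative only: print a few page URLs following the pattern described above
    base_url = 'https://news.cnblogs.com'
    for page in (1, 2, 10):
        print('{}/n/page/{}/'.format(base_url, page))
    # -> https://news.cnblogs.com/n/page/1/
    # -> https://news.cnblogs.com/n/page/2/
    # -> https://news.cnblogs.com/n/page/10/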

    from concurrent.futures import ThreadPoolExecutor
    import threading
    import time
    from queue import Queue
    import logging
    import requests
    from bs4 import BeautifulSoup
    
    # Logging configuration
    FORMAT = "%(asctime)s %(threadName)s %(thread)d %(message)s"
    logging.basicConfig(format=FORMAT, level=logging.INFO)
    
    # Event used to signal all worker threads to stop
    event = threading.Event()
    
    # Base URL prefix and User-Agent value
    base_url = 'https://news.cnblogs.com'
    page_path = '/n/page/'
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    
    # Queues
    urls = Queue()  # URLs waiting to be crawled (a "seen" queue is omitted)
    htmls = Queue()  # raw HTML of each page; large and mostly noise, so it is not persisted
    outputs = Queue()  # extracted (title, link) results waiting to be written out
    
    # 1. Build the URLs to crawl; start is the first page, stop is the last page
    def create_urls(start, stop, step=1):
        for i in range(start, stop + 1, step):
            url = "{}{}{}/".format(base_url, page_path, i)
            # print(url)
            urls.put(url)  # push the generated URL onto the to-crawl queue
        print('URL creation finished')
    
    # create_urls(1, 10)  # build URLs for pages 1 through 10
    # print(urls.qsize())  # queue size would be 10
    
    # 2. Take URLs off the queue, issue requests and collect the responses
    def crawler():  # runs in multiple threads
        while not event.is_set():
            try:
                url = urls.get(True, 1)  # block for at most 1 second waiting for a URL
                response = requests.get(url, headers={'User-Agent': ua})
                with response:
                    html = response.text  # full page text
                    htmls.put(html)  # push the page content onto the htmls queue
                    print('url:', url)
            # queue.Empty raised on timeout (and any request error) is caught here
            except Exception as e:
                print(e)
                # logging.error(e)
    
    
    # 3. Parse the HTML and extract the useful data
    def parse():
        while not event.is_set():
            try:
                html = htmls.get(True, 1)  # block for at most 1 second waiting for a page
                soup = BeautifulSoup(html, 'lxml')  # parse the HTML

                news = soup.select('h2.news_entry a')  # select the news entry links
                for n in news:
                    title = n.text
                    ref = base_url + n.attrs.get('href')
                    print('get_title:', title, 'get_ref:', ref)
                    outputs.put((title, ref))  # push the extracted title and link onto the output queue

            except Exception as e:
                print(e)
                # logging.error(e)
    
    # 4. Persist the results to a file
    def save(path):
        with open(path, 'a+', encoding='utf-8') as f:
            while not event.is_set():
                try:
                    title, ref = outputs.get(True, 1)  # (title, ref) tuple
                    print('save_title:', title, 'save_ref:', ref)
                    f.write('{}_{}\n'.format(title, ref))
                    f.flush()  # flush so results reach the file immediately
                except Exception as e:
                    print(e)
                    # logging.error(e)
    
    # Launch the workers from a thread pool (at most 10 threads)
    executor = ThreadPoolExecutor(max_workers=10)
    executor.submit(create_urls, 1, 10)  # seed URLs; URLs extracted by parse could also be added to the queue later
    executor.submit(parse)
    executor.submit(save, 'news.txt')
    
    for i in range(7):
        executor.submit(crawler)
    
    
    while True:
        cmd = input('>>>')
        if cmd.strip() == 'q':  # type q in the console to stop all threads after roughly one second
            event.set()
            executor.shutdown()
            print('closing')
            time.sleep(1)
            break
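
    Each line written to news.txt is a title and a link joined by an underscore, so the results are easy to read back. A minimal sketch (the file name news.txt simply matches the save() call above):

    # Read back the scraped results; each line has the form "<title>_<link>"
    # Splitting on the last underscore assumes the link itself contains none,
    # which holds for cnblogs news links even if a title contains underscores.
    with open('news.txt', encoding='utf-8') as f:
        for line in f:
            title, _, ref = line.rstrip('\n').rpartition('_')
            print(title, '->', ref)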
  • Original post: https://www.cnblogs.com/hongdanni/p/10573858.html