  • Python Multithreading, Thread-Pool, and Coroutine Crawlers

    Multithreaded producer-consumer crawler: crawler threads pull URLs from url_queue and push raw HTML onto html_queue, while parser threads drain html_queue and write the parsed results to a file. A graceful-shutdown sketch follows the code.

    import queue
    
    import requests
    from bs4 import BeautifulSoup
    import threading
    import time
    import random
    
    
    def craw(url):
        r = requests.get(url=url)
        return r.text
    
    
    def parse(html):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", class_="post-item-title")  # post title links on the cnblogs front page
        return [(link["href"], link.get_text()) for link in links]
    
    
    def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
        # Producer: take a URL off the queue, fetch it, hand the HTML to the parsers.
        while True:
            url = url_queue.get()
            html = craw(url)
            html_queue.put(html)
            print(threading.current_thread().name, url)
            time.sleep(random.randint(1, 2))  # be polite to the server
    
    
    def do_parse(html_queue: queue.Queue, f_out):
        # Consumer: pull HTML off the queue, parse it, append results to the output file.
        while True:
            html = html_queue.get()
            results = parse(html)
            for result in results:
                f_out.write(str(result) + "\n")
            print(threading.current_thread().name, html_queue.qsize())
            time.sleep(1)
    
    
    if __name__ == '__main__':
        url_queue = queue.Queue()
        html_queue = queue.Queue()
        for url in ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]:
            url_queue.put(url)
    
        # Three producer threads fetch pages concurrently.
        for idx in range(3):
            t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw-{idx}")
            t.start()

        # Two consumer threads share one output file handle.
        file = open("02.data.txt", "w")
        for idx in range(2):
            d = threading.Thread(target=do_parse, args=(html_queue, file), name=f"parse-{idx}")
            d.start()
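
    As written, both worker loops spin forever and the output file is never closed, so the script has to be killed by hand. A minimal shutdown sketch using the standard sentinel idiom (the _SENTINEL name and worker() helper are illustrative, not part of the original code):

    import queue
    import threading

    _SENTINEL = None  # marker meaning "no more work"


    def worker(q: queue.Queue):
        while True:
            item = q.get()
            if item is _SENTINEL:
                break  # exit the loop cleanly
            # ... process item here ...
            print(threading.current_thread().name, "processed", item)


    q = queue.Queue()
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
    for t in threads:
        t.start()
    for item in ["a", "b", "c"]:
        q.put(item)
    for _ in threads:
        q.put(_SENTINEL)  # one sentinel per worker
    for t in threads:
        t.join()  # all workers done; now it is safe to close files and exit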
    

    Thread pool crawler: ThreadPoolExecutor manages worker creation and reuse; pool.map returns results in submission order, while submit plus as_completed yields each result as soon as it is ready. A session-reuse refinement follows the code.

    from concurrent.futures import ThreadPoolExecutor, as_completed
    import requests
    from bs4 import BeautifulSoup
    
    spider_url = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]
    
    
    def craw(url):
        r = requests.get(url=url)
        return r.text
    
    
    def parse(html):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", class_="post-item-title")  # post title links on the cnblogs front page
        return [(link["href"], link.get_text()) for link in links]
    
    
    # Fetch: pool.map preserves the order of spider_url in its results.
    with ThreadPoolExecutor() as pool:
        htmls = pool.map(craw, spider_url)
        htmls = list(zip(spider_url, htmls))
        for k, v in htmls:
            print(k, len(v))
    
    
    # Parse: submit returns a Future per task; as_completed yields each one as it finishes.
    with ThreadPoolExecutor() as pool:
        futures = {}
        for url, html in htmls:
            future = pool.submit(parse, html)
            futures[future] = url

        # Alternative: iterate futures.items() to read results in submission order.
        for future in as_completed(futures):
            print(futures[future], future.result())
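
    One practical refinement, sketched below under the assumption that each pool thread should reuse HTTP connections: give every worker thread its own requests.Session via threading.local, so keep-alive connections persist across requests (the get_session() helper and max_workers value are illustrative):

    import threading
    from concurrent.futures import ThreadPoolExecutor

    import requests

    _local = threading.local()  # one Session per worker thread


    def get_session() -> requests.Session:
        # Lazily create a Session the first time each thread asks for one.
        if not hasattr(_local, "session"):
            _local.session = requests.Session()
        return _local.session


    def craw(url: str) -> str:
        # Reuses the calling thread's Session, so connections are pooled.
        return get_session().get(url, timeout=10).text


    urls = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]
    with ThreadPoolExecutor(max_workers=8) as pool:
        for url, html in zip(urls, pool.map(craw, urls)):
            print(url, len(html))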
    

    Coroutines: a single thread interleaves many downloads with asyncio and aiohttp, and a semaphore caps how many requests run at once. A shared-session variant follows the code.

    import asyncio
    import aiohttp
    
    spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"]*50
    
    # A semaphore caps how many requests are in flight at once; it is created
    # inside main() so that it binds to the running event loop.


    async def async_craw(url, semaphore: asyncio.Semaphore):
        async with semaphore:
            print("craw url:", url)
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    result = await resp.text()
                    print(url, len(result))


    async def main():
        semaphore = asyncio.Semaphore(10)
        tasks = [asyncio.create_task(async_craw(url, semaphore)) for url in spider_url]
        await asyncio.gather(*tasks)


    asyncio.run(main())  # replaces the deprecated get_event_loop() pattern
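
    async_craw above opens a new ClientSession per URL; the aiohttp documentation suggests sharing a single session so connections are pooled. A sketch of that variant (the fetch() helper and the shortened URL list are illustrative):

    import asyncio

    import aiohttp


    async def fetch(session: aiohttp.ClientSession, sem: asyncio.Semaphore, url: str) -> int:
        async with sem:  # same concurrency cap as before
            async with session.get(url) as resp:
                return len(await resp.text())


    async def main(urls):
        sem = asyncio.Semaphore(10)
        # One shared session: keep-alive connections are reused across fetches.
        async with aiohttp.ClientSession() as session:
            sizes = await asyncio.gather(*(fetch(session, sem, u) for u in urls))
        for url, size in zip(urls, sizes):
            print(url, size)


    asyncio.run(main(["https://www.cnblogs.com/taozhengquan/p/14966535.html"] * 5))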
    
    
  • Original post: https://www.cnblogs.com/taozhengquan/p/15254297.html