  • Python Multithreading, Thread-Pool, and Coroutine Crawlers

    Multithreaded producer-consumer crawler: crawler threads pull URLs from url_queue and push raw HTML onto html_queue, while parser threads drain html_queue and write the parsed results to a file. A graceful-shutdown sketch follows the code.

    import queue
    
    import requests
    from bs4 import BeautifulSoup
    import threading
    import time
    import random
    
    
    def craw(url):
        r = requests.get(url=url)
        return r.text
    
    
    def parse(html):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", class_="post-item-title")  # post title links on the cnblogs front page
        return [(link["href"], link.get_text()) for link in links]
    
    
    def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
        # Producer: take a URL off the queue, fetch it, hand the HTML to the parsers.
        while True:
            url = url_queue.get()
            html = craw(url)
            html_queue.put(html)
            print(threading.current_thread().name, url)
            time.sleep(random.randint(1, 2))  # be polite to the server
    
    
    def do_parse(html_queue: queue.Queue, f_out):
        # Consumer: pull HTML off the queue, parse it, append results to the output file.
        while True:
            html = html_queue.get()
            results = parse(html)
            for result in results:
                f_out.write(str(result) + "\n")
            print(threading.current_thread().name, html_queue.qsize())
            time.sleep(1)
    
    
    if __name__ == '__main__':
        url_queue = queue.Queue()
        html_queue = queue.Queue()
        for url in ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]:
            url_queue.put(url)
    
        # Three producer threads fetch pages concurrently.
        for idx in range(3):
            t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw-{idx}")
            t.start()

        # Two consumer threads share one output file handle.
        file = open("02.data.txt", "w")
        for idx in range(2):
            d = threading.Thread(target=do_parse, args=(html_queue, file), name=f"parse-{idx}")
            d.start()
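
    As written, both worker loops spin forever and the output file is never closed, so the script has to be killed by hand. A minimal shutdown sketch using the standard sentinel idiom (the _SENTINEL name and worker() helper are illustrative, not part of the original code):

    import queue
    import threading

    _SENTINEL = None  # marker meaning "no more work"


    def worker(q: queue.Queue):
        while True:
            item = q.get()
            if item is _SENTINEL:
                break  # exit the loop cleanly
            # ... process item here ...
            print(threading.current_thread().name, "processed", item)


    q = queue.Queue()
    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
    for t in threads:
        t.start()
    for item in ["a", "b", "c"]:
        q.put(item)
    for _ in threads:
        q.put(_SENTINEL)  # one sentinel per worker
    for t in threads:
        t.join()  # all workers done; now it is safe to close files and exit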
    

    Thread pool crawler: ThreadPoolExecutor manages worker creation and reuse; pool.map returns results in submission order, while submit plus as_completed yields each result as soon as it is ready. A session-reuse refinement follows the code.

    from concurrent.futures import ThreadPoolExecutor, as_completed
    import requests
    from bs4 import BeautifulSoup
    
    spider_url = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]
    
    
    def craw(url):
        r = requests.get(url=url)
        return r.text
    
    
    def parse(html):
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("a", class_="post-item-title")  # post title links on the cnblogs front page
        return [(link["href"], link.get_text()) for link in links]
    
    
    # Fetch: pool.map preserves the order of spider_url in its results.
    with ThreadPoolExecutor() as pool:
        htmls = pool.map(craw, spider_url)
        htmls = list(zip(spider_url, htmls))
        for k, v in htmls:
            print(k, len(v))
    
    
    # Parse: submit returns a Future per task; as_completed yields each one as it finishes.
    with ThreadPoolExecutor() as pool:
        futures = {}
        for url, html in htmls:
            future = pool.submit(parse, html)
            futures[future] = url

        # Alternative: iterate futures.items() to read results in submission order.
        for future in as_completed(futures):
            print(futures[future], future.result())
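
    One practical refinement, sketched below under the assumption that each pool thread should reuse HTTP connections: give every worker thread its own requests.Session via threading.local, so keep-alive connections persist across requests (the get_session() helper and max_workers value are illustrative):

    import threading
    from concurrent.futures import ThreadPoolExecutor

    import requests

    _local = threading.local()  # one Session per worker thread


    def get_session() -> requests.Session:
        # Lazily create a Session the first time each thread asks for one.
        if not hasattr(_local, "session"):
            _local.session = requests.Session()
        return _local.session


    def craw(url: str) -> str:
        # Reuses the calling thread's Session, so connections are pooled.
        return get_session().get(url, timeout=10).text


    urls = ["https://www.cnblogs.com/#p{}".format(i) for i in range(1, 25)]
    with ThreadPoolExecutor(max_workers=8) as pool:
        for url, html in zip(urls, pool.map(craw, urls)):
            print(url, len(html))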
    

    Coroutines: a single thread interleaves many downloads with asyncio and aiohttp, and a semaphore caps how many requests run at once. A shared-session variant follows the code.

    import asyncio
    import aiohttp
    
    spider_url = ["https://www.cnblogs.com/taozhengquan/p/14966535.html"]*50
    
    # A semaphore caps how many requests are in flight at once; it is created
    # inside main() so that it binds to the running event loop.


    async def async_craw(url, semaphore: asyncio.Semaphore):
        async with semaphore:
            print("craw url:", url)
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as resp:
                    result = await resp.text()
                    print(url, len(result))


    async def main():
        semaphore = asyncio.Semaphore(10)
        tasks = [asyncio.create_task(async_craw(url, semaphore)) for url in spider_url]
        await asyncio.gather(*tasks)


    asyncio.run(main())  # replaces the deprecated get_event_loop() pattern
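
    async_craw above opens a new ClientSession per URL; the aiohttp documentation suggests sharing a single session so connections are pooled. A sketch of that variant (the fetch() helper and the shortened URL list are illustrative):

    import asyncio

    import aiohttp


    async def fetch(session: aiohttp.ClientSession, sem: asyncio.Semaphore, url: str) -> int:
        async with sem:  # same concurrency cap as before
            async with session.get(url) as resp:
                return len(await resp.text())


    async def main(urls):
        sem = asyncio.Semaphore(10)
        # One shared session: keep-alive connections are reused across fetches.
        async with aiohttp.ClientSession() as session:
            sizes = await asyncio.gather(*(fetch(session, sem, u) for u in urls))
        for url, size in zip(urls, sizes):
            print(url, size)


    asyncio.run(main(["https://www.cnblogs.com/taozhengquan/p/14966535.html"] * 5))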
    
    
  • Original post: https://www.cnblogs.com/taozhengquan/p/15254297.html