zoukankan      html  css  js  c++  java
  • 爬虫之单线程多任务异步抓取

    协程

    import asyncio
    import time
    #定义了一个特殊的函数
    #特殊:调用后会返回一个协程对象,且函数内部的实现语句不会被立即执行
    #创建一个协程对象
    # async def test(num):
    #     print(num)
    #
    # c = test(10)
    # print(c)
    
    
    #封装一个任务对象
    # async def test(num):
    #     print(num)
    #
    # c = test(10)
    # #根据协程对象封装了一个任务对象
    # task = asyncio.ensure_future(c)
    # print(task)
    
    # Event-loop demo: wrap a coroutine object in a Task and drive it
    # to completion on an event loop.
    async def request(url):
        # Simulated fetch: announce, pause 2 s, announce completion.
        print('正在请求:',url)
        # NOTE(review): time.sleep blocks the event loop; the awaitable form
        # (await asyncio.sleep) is introduced in the multi-task section below.
        time.sleep(2)
        print('请求完毕!',url)


    # Calling a coroutine function returns a coroutine object; the body
    # does not run yet.
    c1 = request('www.1.com')

    # Wrap the coroutine object in a Task (a schedulable unit).
    task_A = asyncio.ensure_future(c1)


    # Create an event loop
    loop = asyncio.get_event_loop()
    # Register the task on the loop and run until it finishes
    loop.run_until_complete(task_A)

    任务对象绑定回调

    import asyncio
    import time
    
    async def request(url):
        # Simulated fetch; the returned url becomes the task's result()
        # once the task completes.
        print('正在请求:',url)
        # NOTE(review): time.sleep is blocking — acceptable in this single-task
        # demo, but it would serialize multiple tasks.
        time.sleep(2)
        print('请求完毕!',url)

        return url
    
    
    # Done-callback for a task object; *task* is the very task the callback
    # was attached to via add_done_callback().
    def task_callback(task):
        """Announce the callback, then show the task's return value."""
        print('i am task_callback()')
        # result() yields whatever the wrapped coroutine returned
        result = task.result()
        print(result)
    
    c = request('www.xxx.com')  # coroutine object (body not yet run)

    task = asyncio.ensure_future(c)  # wrap it in a task
    # task_callback fires automatically when the task finishes
    task.add_done_callback(task_callback)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(task)

    多任务异步协程

    import asyncio
    import time
    start = time.time()
    # Inside a coroutine, only async-aware calls may be used for waiting.
    async def request(url):
        """Pretend to fetch *url*: suspend 2 s without blocking the loop."""
        print('正在请求:',url)
        # time.sleep would stall every task; asyncio.sleep must be awaited,
        # which suspends only this task and lets the others run meanwhile.
        await asyncio.sleep(2)
        print('请求完毕!',url)
        return url
    
    # Three fake URLs -> three concurrent tasks below.
    urls = [
        'www.1.com',
        'www.2.com',
        'www.3.com'
    ]
    def task_callback(task):
        """Done-callback: print the value the finished task returned."""
        value = task.result()
        print(value)
    
    tasks = []  # task list: holds one task object per URL
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c)
        task.add_done_callback(task_callback)
        tasks.append(task)  # collect the tasks so they can run together

    loop = asyncio.get_event_loop()
    # Register all tasks at once: asyncio.wait suspends the whole task list
    # and the loop interleaves them.
    loop.run_until_complete(asyncio.wait(tasks))

    # ~2 s total: the three 2-second suspensions overlap instead of adding up.
    print(time.time()-start)

    多任务异步爬虫

    import asyncio
    import time
    import requests
    start = time.time()
    # NOTE(review): requests is a blocking (async-unaware) library, so even
    # though request() is a coroutine, the tasks below cannot overlap their
    # I/O — this is the anti-pattern the aiohttp section fixes.
    async def request(url):
       # Blocking fetch inside a coroutine; returns the response body text.
       print('正在请求:',url)
       response = requests.get(url)
       return response.text
    
    urls = [
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom',
        'http://127.0.0.1:5000/jay'
    ]
    
    def parse(task):
        """Done-callback: print the fetched page text with a suffix tag."""
        body = task.result()
        message = body + ',请求到的数据!!!'
        print(message)
    
    tasks = []
    for url in urls:
        c = request(url)  # one coroutine object per URL
        task = asyncio.ensure_future(c)
        task.add_done_callback(parse)  # parse runs when the fetch finishes
        tasks.append(task)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))


    # NOTE(review): because requests.get blocks the loop, this elapsed time
    # is roughly the SUM of the individual requests — no real concurrency.
    print(time.time()-start)

    aiohttp使用

    # import asyncio
    # import time
    # import aiohttp
    # start = time.time()
    在特殊函数内部不可以出现不支持异步模块相关的代码
    简单的基本架构:
    async def request(url):
       with aiohttp.ClientSession() as s:
           #s.get/post和requests中的get/post用法几乎一样:url,headers,data/params
           #在s.get中如果使用代理操作:proxy="http://ip:port"
           with s.get(url) as response:
               #获取字符串形式的响应数据:response.text()
               #获取byte类型的:response.read()
               page_text = response.text()
               return page_text
    在当前架构的基础上补充细节即可
        细节1:在每一个with前加上async关键字
        细节2:在get方法前和response.text()前加上await关键字进行手动挂起操作
    # async def request(url):
       # async with aiohttp.ClientSession() as s:
           s.get/post和requests中的get/post用法几乎一样:url,headers,data/params
           在s.get中如果使用代理操作:proxy="http://ip:port"
           # async with await s.get(url) as response:
               获取字符串形式的响应数据:response.text()
               获取byte类型的:response.read()
               # page_text = await response.text()
               # return page_text
    
    urls = [
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom',
        'http://127.0.0.1:5000/jay',
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom',
        'http://127.0.0.1:5000/jay',
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom',
        'http://127.0.0.1:5000/jay',
    ]
    # urls = []
    # for i in range(500):
        # urls.append('http://127.0.0.1:5000/bobo')
    # def parse(task):
        # page_text = task.result()
        # print(page_text+',请求到的数据!!!')
    
    # tasks = []
    # for url in urls:
        # c = request(url)
        # task = asyncio.ensure_future(c)
        # task.add_done_callback(parse)
        # tasks.append(task)
    
    # loop = asyncio.get_event_loop()
    # loop.run_until_complete(asyncio.wait(tasks))
    # print(time.time()-start)

    案例

    import aiohttp
    import asyncio
    from lxml import etree
    
    all_titles = []  # accumulates every scraped title across all pages

    # Spoof a desktop-browser User-Agent so the site serves the normal page.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

    }
    async def request(url):
        # Async fetch with aiohttp: each resource-holding step is an async
        # context manager, so the loop can interleave the page downloads.
        async with aiohttp.ClientSession() as s:
            async with await s.get(url,headers=headers) as response:
                # response.text() is itself a coroutine and must be awaited
                page_text = await response.text()
                return page_text
    
    urls = []
    # %d placeholder takes the record offset; the site pages in steps of 30.
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
    for page in range(100):
        u_page = page * 30
        # url % u_page already yields a str; the format() wrapper was redundant
        new_url = url % u_page
        urls.append(new_url)
    
    tasks = []
    def parse(task):
        # Done-callback: pull the complaint titles out of one downloaded page.
        page_text = task.result()
        # NOTE(review): gb2312-encode then gbk-decode looks like a mojibake
        # repair for the site's declared charset — verify against live pages.
        page_text = page_text.encode('gb2312').decode('gbk')
        tree = etree.HTML(page_text)
        # Rows of the nested listing table
        tr_list = tree.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            # Second link in the second cell carries the post title
            title = tr.xpath('./td[2]/a[2]/text()')[0]
            print(title)
            all_titles.append(title)

    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c)
        task.add_done_callback(parse)
        tasks.append(task)

    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncio.wait(tasks))
  • 相关阅读:
    MVC模式-----struts2框架(2)
    MVC模式-----struts2框架
    html的<h>标签
    jsp脚本元素
    LeetCode "Paint House"
    LeetCode "Longest Substring with At Most Two Distinct Characters"
    LeetCode "Graph Valid Tree"
    LeetCode "Shortest Word Distance"
    LeetCode "Verify Preorder Sequence in Binary Search Tree"
    LeetCode "Binary Tree Upside Down"
  • 原文地址:https://www.cnblogs.com/songzhixue/p/11311705.html
Copyright © 2011-2022 走看看