zoukankan      html  css  js  c++  java
  • 爬虫之单线程多任务异步抓取

    协程

    import asyncio
    import time
    #定义了一个特殊的函数
    #特殊:调用后会返回一个协程对象,且函数内部的实现语句不会被立即执行
    #创建一个协程对象
    # async def test(num):
    #     print(num)
    #
    # c = test(10)
    # print(c)
    
    
    #封装一个任务对象
    # async def test(num):
    #     print(num)
    #
    # c = test(10)
    # #根据协程对象封装了一个任务对象
    # task = asyncio.ensure_future(c)
    # print(task)
    
    #事件循环对象
    async def request(url):
        """Simulate a slow request for ``url``.

        Prints a start message, pauses for two seconds with ``time.sleep``
        and prints a completion message. Nothing is returned.
        """
        delay_seconds = 2
        print('正在请求:', url)
        time.sleep(delay_seconds)
        print('请求完毕!', url)
    
    
    # Create and install the event loop explicitly. Letting
    # asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly
    # is deprecated and raises RuntimeError on Python 3.14+ when no loop is
    # current.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # Calling the coroutine function only builds a coroutine object.
    c1 = request('www.1.com')

    # Wrap the coroutine in a task object bound to the loop created above.
    task_A = asyncio.ensure_future(c1, loop=loop)

    # Register the task with the loop and run the loop until it finishes.
    loop.run_until_complete(task_A)

    任务对象绑定回调

    import asyncio
    import time
    
    async def request(url):
        """Simulate a slow request and return ``url`` as the task result.

        Prints a start message, pauses for two seconds with ``time.sleep``
        and prints a completion message before returning.
        """
        delay_seconds = 2
        print('正在请求:', url)
        time.sleep(delay_seconds)
        print('请求完毕!', url)
        return url
    
    
    #定义一个任务对象的回调函数
    #task参数表示的就是该函数被绑定的那个任务对象
    def task_callback(task):
        """Done-callback: announce itself, then print the task's result.

        ``task`` is the task object this callback was attached to;
        ``task.result()`` yields the value returned by the coroutine
        function that the task wraps.
        """
        print('i am task_callback()')
        outcome = task.result()
        print(outcome)
    
    # Create and install the event loop explicitly. Letting
    # asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly
    # is deprecated and raises RuntimeError on Python 3.14+ when no loop is
    # current.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    c = request('www.xxx.com')

    # Wrap the coroutine in a task on that loop and attach the callback that
    # runs once the task is done.
    task = asyncio.ensure_future(c, loop=loop)
    task.add_done_callback(task_callback)

    loop.run_until_complete(task)

    多任务异步协程

    import asyncio
    import time
    start = time.time()
    #在特殊函数内部不可以出现不支持异步模块相关的代码
    async def request(url):
        """Simulate a request for ``url`` and return ``url``.

        The two-second wait is awaited through ``asyncio.sleep`` instead of
        calling ``time.sleep``, so the coroutine is suspended while waiting.
        """
        wait_seconds = 2
        print('正在请求:', url)
        # A blocking step must be awaited so that the coroutine is suspended.
        await asyncio.sleep(wait_seconds)
        print('请求完毕!', url)
        return url
    
    # Placeholder hosts www.1.com .. www.3.com, one simulated request each.
    urls = ['www.%d.com' % number for number in range(1, 4)]
    def task_callback(task):
        """Done-callback: print the value returned by the finished task."""
        outcome = task.result()
        print(outcome)
    
    # Create and install the event loop explicitly. Letting
    # asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly
    # is deprecated and raises RuntimeError on Python 3.14+ when no loop is
    # current.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    tasks = []  # task list: holds one task object per URL
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c, loop=loop)
        task.add_done_callback(task_callback)
        tasks.append(task)  # collect every task object in the task list

    # Register all tasks at once: asyncio.wait() waits on every task in the
    # list, and the loop runs until that wait completes.
    loop.run_until_complete(asyncio.wait(tasks))

    # Total elapsed wall-clock time for all requests.
    print(time.time()-start)

    多任务异步爬虫

    import asyncio
    import time
    import requests
    start = time.time()
    #在特殊函数内部不可以出现不支持异步模块相关的代码
    async def request(url):
        """Fetch ``url`` with ``requests`` and return the response body text.

        NOTE: ``requests.get`` is a synchronous call and nothing in this
        coroutine is awaited.
        """
        print('正在请求:', url)
        return requests.get(url).text
    
    # Three endpoints on a server at 127.0.0.1:5000.
    urls = [
        'http://127.0.0.1:5000/' + endpoint
        for endpoint in ('bobo', 'tom', 'jay')
    ]
    
    def parse(task):
        """Done-callback: print the fetched page text followed by a marker."""
        suffix = ',请求到的数据!!!'
        print(task.result() + suffix)
    
    # Create and install the event loop explicitly. Letting
    # asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly
    # is deprecated and raises RuntimeError on Python 3.14+ when no loop is
    # current.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    tasks = []  # one task object per URL
    for url in urls:
        c = request(url)
        task = asyncio.ensure_future(c, loop=loop)
        task.add_done_callback(parse)
        tasks.append(task)

    # Run the loop until every task in the list has finished.
    loop.run_until_complete(asyncio.wait(tasks))


    # Total elapsed wall-clock time for all requests.
    print(time.time()-start)

    aiohttp使用

    # import asyncio
    # import time
    # import aiohttp
    # start = time.time()
    在特殊函数内部不可以出现不支持异步模块相关的代码
    简单的基本架构:
    # Skeleton only: the text that follows this snippet says to complete it by
    # putting `async` before each `with` and `await` before the get call and
    # before response.text().
    async def request(url):
       with aiohttp.ClientSession() as s:
           # s.get/post take almost the same arguments as get/post in
           # requests: url, headers, data/params
           # To send the request through a proxy, pass proxy="http://ip:port"
           # to s.get
           with s.get(url) as response:
               # Response body as a string: response.text()
               # Response body as bytes: response.read()
               page_text = response.text()
               return page_text
    在当前架构的基础上补充细节即可
        细节1:在每一个with前加上async关键字
        细节2:在get方法前和response.text()前加上await关键字进行手动挂起操作
    # async def request(url):
       # async with aiohttp.ClientSession() as s:
           s.get/post和requests中的get/post用法几乎一样:url,headers,data/params
           在s.get中如果使用代理操作:proxy="http://ip:port"
           # async with await s.get(url) as response:
               获取字符串形式的响应数据:response.text()
               获取byte类型的:response.read()
               # page_text = await response.text()
               # return page_text
    
    # The same three endpoints, repeated three times (nine requests in total).
    urls = [
        'http://127.0.0.1:5000/bobo',
        'http://127.0.0.1:5000/tom',
        'http://127.0.0.1:5000/jay',
    ] * 3
    # urls = []
    # for i in range(500):
        # urls.append('http://127.0.0.1:5000/bobo')
    # def parse(task):
        # page_text = task.result()
        # print(page_text+',请求到的数据!!!')
    
    # tasks = []
    # for url in urls:
        # c = request(url)
        # task = asyncio.ensure_future(c)
        # task.add_done_callback(parse)
        # tasks.append(task)
    
    # loop = asyncio.get_event_loop()
    # loop.run_until_complete(asyncio.wait(tasks))
    # print(time.time()-start)

    案例

    import aiohttp
    import asyncio
    from lxml import etree
    
    # Titles appended by the parse callback.
    all_titles = list()

    # Request headers sent with every page fetch (browser User-Agent string).
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/70.0.3538.77 Safari/537.36'
        ),
    }
    async def request(url):
        """Fetch ``url`` with aiohttp and return the response body as text.

        A new ``ClientSession`` is opened for the call and the module-level
        ``headers`` are sent with the GET request.
        """
        async with aiohttp.ClientSession() as s:
            # s.get() returns an async context manager; enter it directly
            # instead of awaiting it first.
            async with s.get(url, headers=headers) as response:
                return await response.text()
    
    # URL template; %d is filled with the value of the `page` query parameter.
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
    # 100 listing pages: the page parameter advances in steps of 30
    # (0, 30, ..., 2970).
    urls = [url % offset for offset in range(0, 100 * 30, 30)]
    
    tasks = []
    def parse(task):
        """Done-callback: extract post titles from a fetched listing page.

        Reads the page text from ``task.result()``, parses it with lxml and,
        for every table row that contains a title link, prints the title and
        appends it to the module-level ``all_titles`` list. Rows without a
        title link are skipped.
        """
        page_text = task.result()
        # NOTE(review): this round-trip raises UnicodeEncodeError for any
        # character outside GB2312 -- confirm it is actually required.
        page_text = page_text.encode('gb2312').decode('gbk')
        tree = etree.HTML(page_text)
        tr_list = tree.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            matches = tr.xpath('./td[2]/a[2]/text()')
            if not matches:
                # A row without the expected link would otherwise raise
                # IndexError and abort parsing of the remaining rows.
                continue
            title = matches[0]
            print(title)
            all_titles.append(title)
    
    # Create and install the event loop explicitly. Letting
    # asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly
    # is deprecated and raises RuntimeError on Python 3.14+ when no loop is
    # current.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    # One task per listing page; parse runs as each task completes.
    # The loop variable is named page_url so that it does not overwrite the
    # `url` template string.
    for page_url in urls:
        c = request(page_url)
        task = asyncio.ensure_future(c, loop=loop)
        task.add_done_callback(parse)
        tasks.append(task)

    # Run the loop until every task in the list has finished.
    loop.run_until_complete(asyncio.wait(tasks))
  • 相关阅读:
    Bootstrap之Carousel问题
    IMG图片和文字同行显示
    divcss5布局
    使用PHP QR Code生成二维码
    mysql grant用户权限设置
    Linux下的压缩解压缩命令详解
    linux网站目录及Apache权限的设置
    lamp环境编译(apache2.4.7 php5.4.25 mysql 5.5.23)
    mysql开启远程访问
    lamp环境编译(实际通过)
  • 原文地址:https://www.cnblogs.com/songzhixue/p/11311705.html
Copyright © 2011-2022 走看看