zoukankan      html  css  js  c++  java
  • 爬虫(九)—— 爬虫高性能

    爬虫高性能

    一、并发爬取

    线程池或进程池+异步调用:提交一个任务后并不会等待任务结束,而是继续下一行代码

    import requests
    from threading import current_thread
    from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor
    
    def parse_page(res):
        """Done-callback for a download task: print the length of the fetched page.

        The callback receives the finished future rather than the page itself,
        so the text is unwrapped with ``result()`` first.

        :param res: finished future whose result is the page text, or ``None``
            when the download produced no page.
        """
        page=res.result()
        # A download that yields no page gives None; report length 0 instead of
        # letting len(None) raise TypeError inside the callback.
        size=0 if page is None else len(page)
        # Thread.getName() is deprecated; the ``name`` attribute holds the same value.
        print('%s 解析 %s' %(current_thread().name,size))
    
    def get_page(url):
        """Download ``url`` and return its body text.

        :param url: address fetched with ``requests.get``.
        :return: the response text when the status code is 200, otherwise ``None``.
        """
        # Thread.getName() is deprecated; the ``name`` attribute holds the same value.
        print('%s 下载 %s' %(current_thread().name,url))
        response=requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    
    if __name__ == '__main__':
        urls=['https://www.baidu.com/','http://www.sina.com.cn/','https://www.python.org']

        # Leaving the ``with`` block waits for every submitted task and then
        # shuts the pool down, even if submitting one of them raises.
        # ProcessPoolExecutor(50) can be swapped in to use processes instead.
        with ThreadPoolExecutor(50) as pool:
            for url in urls:
                # submit() returns at once; parse_page is attached as the done-callback.
                pool.submit(get_page,url).add_done_callback(parse_page)
    
    

    二、高性能

    ​ 上述无论哪种解决方案其实没有解决一个性能相关的问题:IO阻塞,无论是多进程还是多线程,在遇到IO阻塞时都会被操作系统强行剥夺走CPU的执行权限,程序的执行效率因此就降低了下来。

    ​ 解决这一问题的关键在于,我们自己从应用程序级别检测IO阻塞然后切换到我们自己程序的其他任务执行,这样把我们程序的IO降到最低,我们的程序处于就绪态就会增多,以此来迷惑操作系统,操作系统便以为我们的程序是IO比较少的程序,从而会尽可能多的分配CPU给我们,这样也就达到了提升程序执行效率的目的

    1、监测IO—— asyncio模块(异步)

    import asyncio
    
    async def task(task_id,senconds):
        """Print a start message, sleep without blocking the loop, print an end message.

        Written as a native coroutine because the ``@asyncio.coroutine``
        decorator no longer exists in Python 3.11 and later.

        :param task_id: label printed in the start and end messages.
        :param senconds: delay in seconds passed to ``asyncio.sleep`` (the
            misspelled name is kept because callers pass it by keyword).
        """
        print('%s is start' %task_id)
        # Awaiting suspends this coroutine so the event loop can run the other tasks.
        await asyncio.sleep(senconds)
        print('%s is end' %task_id)
    
    tasks=[task(task_id="任务1",senconds=3),task("任务2",2),task(task_id="任务3",senconds=1)]

    loop=asyncio.get_event_loop()     # event loop that drives the coroutines
    # gather() wraps bare coroutines into tasks itself; asyncio.wait() stopped
    # accepting bare coroutines in Python 3.11.
    loop.run_until_complete(asyncio.gather(*tasks))    # block until every task has finished
    loop.close()    # release the loop
    

    2、自定义http协议(太麻烦,不常用)

    asyncio模块只能发tcp级别的请求,不能发http协议,因此,在我们需要发送http请求的时候,需要我们自定义http报头

    import asyncio
    import requests
    import uuid
    # Value placed in the User-agent header of the hand-built HTTP request.
    user_agent='Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'
    
    def parse_page(host,res):
        """Report the size of a downloaded body and dump it into a new file.

        :param host: label printed next to the size.
        :param res: response body as bytes; written unchanged to
            ``<uuid1>.html`` in the current directory.
        """
        size=len(res)
        print('%s 解析结果 %s' %(host,size))
        # uuid1() gives each dump its own file name.
        filename='%s.html' %(uuid.uuid1())
        with open(filename,'wb') as out:
            out.write(res)
    
    async def get_page(host,port=80,url='/',callback=parse_page,ssl=False):
        """Fetch one page over a raw asyncio stream with a hand-written HTTP/1.0 request.

        :param host: server name; used for the connection and the ``Host`` header.
        :param port: TCP port to connect to; replaced by 443 when ``ssl`` is true.
        :param url: request target placed on the request line.
        :param callback: called as ``callback(host, body)`` with the raw body bytes.
        :param ssl: forwarded to ``asyncio.open_connection``.
        """
        print('下载 http://%s:%s%s' %(host,port,url))

        # Step 1 (waits on IO): open the TCP connection on the resolved port.
        if ssl:
            port=443
        recv,send=await asyncio.open_connection(host=host,port=port,ssl=ssl)

        try:
            # Step 2: build the HTTP request by hand. Every header line ends with
            # CRLF and an empty line terminates the header section.
            request_headers="GET %s HTTP/1.0\r\nHost: %s\r\nUser-agent: %s\r\n\r\n" %(url,host,user_agent)
            # POST variant, kept for reference:
            # request_headers="POST %s HTTP/1.0\r\nHost: %s\r\n\r\nname=egon&password=123" %(url,host)
            request_headers=request_headers.encode('utf-8')

            # Step 3 (waits on IO): send the request.
            send.write(request_headers)
            await send.drain()

            # Step 4 (waits on IO): read response header lines up to the blank line.
            while True:
                line=await recv.readline()
                # b'' means the peer closed the stream; stop instead of looping forever.
                if line in (b'\r\n',b'\n',b''):
                    break
                print('%s Response headers:%s' %(host,line))

            # Step 5 (waits on IO): read the response body until EOF.
            text=await recv.read()

            # Step 6: hand the body to the callback.
            callback(host,text)
        finally:
            # Step 7: close the connection through the writer (the reader has no
            # close() of its own), also when one of the steps above failed.
            send.close()
    
    
    if __name__ == '__main__':
        tasks=[
            get_page('www.baidu.com',url='/s?wd=美女',ssl=True),
            get_page('www.cnblogs.com',url='/',ssl=True),
        ]

        loop=asyncio.get_event_loop()
        # gather() wraps bare coroutines into tasks itself; asyncio.wait() stopped
        # accepting bare coroutines in Python 3.11.
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    

    3、封装http报头——aiohttp模块

    自定义http报头多少有点麻烦,于是有了aiohttp模块,专门帮我们封装http报头,然后我们还需要用asyncio检测IO实现切换

    import aiohttp
    import asyncio
    
    async def get_page(url):
        """Download ``url`` with aiohttp and print the raw body.

        :param url: address to request with GET.
        :return: always 1.
        """
        print('GET:%s' %url)
        # aiohttp.request() is used as an async context manager: entering it sends
        # the request, leaving it releases the response.
        async with aiohttp.request('GET',url) as response:
            data=await response.read()    # receive the response body

        print(url,data)
        return 1
    
    # One coroutine per page, in the order the results should come back.
    tasks=[
        get_page(page_url)
        for page_url in (
            'https://www.python.org/doc',
            'https://www.cnblogs.com/linhaifeng',
            'https://www.openstack.org',
        )
    ]

    loop=asyncio.get_event_loop()
    # gather() returns the coroutines' return values as a list, in argument order.
    results=loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()

    print('=====>',results) # [1, 1, 1]
    

    4、asyncio+requests模块的方法

    import requests
    import asyncio
    
    async def get_page(func,*args):
        """Run a blocking callable in the loop's default executor and report its response.

        :param func: blocking callable, e.g. ``requests.get``.
        :param args: positional arguments for ``func``; ``args[0]`` is printed as the URL.
        :return: always 1.
        """
        print('GET:%s' %args[0])
        # Look the loop up locally so the coroutine does not depend on a
        # module-level ``loop`` variable.
        loop=asyncio.get_event_loop()
        # None selects the loop's default executor, so func runs in a worker
        # thread while the loop stays free for other coroutines.
        future=loop.run_in_executor(None,func,*args)
        response=await future

        print(response.url,len(response.text))
        return 1
    
    # One coroutine per page; each runs requests.get in a worker thread.
    tasks=[
        get_page(requests.get,page_url)
        for page_url in (
            'https://www.python.org/doc',
            'https://www.cnblogs.com/linhaifeng',
            'https://www.openstack.org',
        )
    ]

    loop=asyncio.get_event_loop()
    # gather() returns the coroutines' return values as a list, in argument order.
    results=loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    
    

    5、协程gevent——单线程并发

    from gevent import monkey;monkey.patch_all()
    import gevent
    import requests
    
    def get_page(url):
        """Download ``url`` and print the length of its body text.

        :param url: address fetched with ``requests.get``.
        :return: always 1.
        """
        print('GET:%s' %url)
        reply=requests.get(url)
        body_length=len(reply.text)
        print(url,body_length)
        return 1
    
    # Plain greenlets, without a pool:
    # g1=gevent.spawn(get_page,'https://www.python.org/doc')
    # g2=gevent.spawn(get_page,'https://www.cnblogs.com/linhaifeng')
    # g3=gevent.spawn(get_page,'https://www.openstack.org')
    # gevent.joinall([g1,g2,g3,])
    # print(g1.value,g2.value,g3.value)     # read the return values


    # Greenlet pool of size 2
    from gevent.pool import Pool
    pool=Pool(2)
    g1,g2,g3=[
        pool.spawn(get_page,page_url)
        for page_url in (
            'https://www.python.org/doc',
            'https://www.cnblogs.com/linhaifeng',
            'https://www.openstack.org',
        )
    ]
    # Wait for all three greenlets before reading their results.
    gevent.joinall([g1,g2,g3])
    print(g1.value,g2.value,g3.value)     # each .value is the greenlet's return value
    

    6、gevent和requests ===> grequests模块

    #pip3 install grequests
    
    import grequests
    
    # One request object per URL, in this order.
    # NOTE(review): the first host looks deliberately unreachable so that the
    # exception handler has something to report; confirm before "fixing" it.
    request_list=[
        grequests.get(target)
        for target in (
            'https://wwww.xxxx.org/doc1',
            'https://www.cnblogs.com/linhaifeng',
            'https://www.openstack.org',
        )
    ]
    
    
    ##### 执行并获取响应列表 #####
    # response_list = grequests.map(request_list)
    # print(response_list)
    
    ##### 执行并获取响应列表(处理异常) #####
    def exception_handler(request, exception):
        """Report a request that failed.

        :param request: the failed request; only its ``url`` attribute is read.
        :param exception: the error that was raised (not used).
        """
        failed_url=request.url
        print("%s Request failed" %failed_url)
    
    # Run every request in request_list, passing exception_handler to grequests.map, then print the result.
    response_list = grequests.map(request_list, exception_handler=exception_handler)
    print(response_list)
    

    7、twisted框架

    twisted框架是一个网络框架,其中一个功能是发送异步请求检测IO并自动切换

    (1)安装

    # 1.问题一:error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
    https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
    pip3 install C:\Users\Administrator\Downloads\Twisted-17.9.0-cp36-cp36m-win_amd64.whl
    pip3 install twisted
    
    # 2.问题二:ModuleNotFoundError: No module named 'win32api'
    https://sourceforge.net/projects/pywin32/files/pywin32/
    
    # 3.问题三:openssl
    pip3 install pyopenssl
    
    

    (2)twisted基本用法

    # twisted基本用法
    from twisted.web.client import getPage,defer
    from twisted.internet import reactor
    
    def all_done(arg):
        """Stop the reactor loop.

        :param arg: value passed in by the deferred chain (not used).
        """
        reactor.stop()
    
    def parse_page(res):
        """Print a downloaded page.

        :param res: value delivered by the download deferred.
        :return: always 1.
        """
        print(res)
        status=1
        return status
    
    defer_list=[]
    urls=[
        'http://www.baidu.com',
        'http://www.bing.com',
        'https://www.python.org',
    ]
    for url in urls:
        # The URL is handed to getPage as UTF-8 bytes.
        obj=getPage(url.encode('utf-8'))
        obj.addCallback(parse_page)
        defer_list.append(obj)

    # all_done is attached with addBoth, so it runs whether the combined
    # deferred ends in success or failure.
    defer.DeferredList(defer_list).addBoth(all_done)

    reactor.run()    # start the reactor loop
    

    (3)twisted的getPage的详细用法

    # twisted的getPage的详细用法
    from twisted.internet import reactor
    from twisted.web.client import getPage
    import urllib.parse
    
    
    def one_done(arg):
        """Print the outcome of the request, then stop the reactor loop.

        :param arg: value delivered by the request's deferred.
        """
        print(arg)
        reactor.stop()
    
    # URL-encoded form body, converted to bytes before it is sent.
    post_data = urllib.parse.urlencode({'check_data': 'adf'}).encode('utf8')
    headers = {b'Content-Type': b'application/x-www-form-urlencoded'}
    response = getPage(
        b'http://dig.chouti.com/login',
        method=b'POST',
        postdata=post_data,
        cookies={},
        headers=headers,
    )
    # addBoth: one_done runs on success and on failure alike.
    response.addBoth(one_done)

    reactor.run()
    

    8、tornado框架——高并发、异步非阻塞IO

    高并发高性能,异步非阻塞IO,websocket长连接,内置高性能HTTP服务器

    from tornado.httpclient import AsyncHTTPClient
    from tornado.httpclient import HTTPRequest
    from tornado import ioloop
    
    count=0   # requests still in flight; the loop does not end by itself once all work is done, so this counter tells the last callback when to stop it
    
    def handle_response(response):
        """Print the outcome of one fetch and stop the IO loop after the last one.

        The module-level ``count`` is decremented on every call; when it
        reaches zero ``ioloop.IOLoop.current().stop()`` is called.

        :param response: finished fetch; ``error`` and ``body`` are read.
        :return: None
        """
        global count

        failure=response.error
        if not failure:
            print(len(response.body))
        else:
            print("Error:", failure)

        # One callback finished: one fewer request in flight.
        count=count-1
        if count != 0:
            return
        ioloop.IOLoop.current().stop()
    
    def func():
        """Start one asynchronous fetch per URL and count the requests in flight.

        Every finished fetch is delivered to ``handle_response``; the
        module-level ``count`` is incremented once per request issued.
        """
        # Imported locally so the snippet's import header stays as it is.
        from tornado.httpclient import HTTPResponse

        global count

        url_list = [
            'http://www.baidu.com',
            'http://www.bing.com',
        ]

        def deliver(done,request):
            # Turn the future's outcome into the response object handle_response
            # reads; an exception becomes a 599 response carrying the error, so
            # the counter is still decremented and the loop can stop.
            try:
                response=done.result()
            except Exception as exc:
                response=HTTPResponse(request,599,error=exc)
            handle_response(response)

        io_loop=ioloop.IOLoop.current()
        http_client = AsyncHTTPClient()
        for url in url_list:
            print(url)
            request=HTTPRequest(url)
            # fetch() no longer accepts a callback argument in Tornado 6; hook the
            # handler onto the returned future instead. raise_error=False keeps
            # non-200 replies as responses with ``error`` set.
            fetching=http_client.fetch(request,raise_error=False)
            io_loop.add_future(fetching,lambda done,request=request: deliver(done,request))
            count+=1 # one more request in flight
    
    # Register func as a callback on the current IO loop, then start the loop.
    ioloop.IOLoop.current().add_callback(func)
    ioloop.IOLoop.current().start()
    
    
  • 相关阅读:
    将RIP协议配置成单播
    powershell的超级破烂的设置问题。
    netsh trace抓包结合microsoft network monitor 进行分析
    Managing Windows Programs from the Command Line: Tasklist
    windows 7 的个超级工具
    Metasploit开源安全漏洞检测工具
    Assigned Internet Protocol Numbers
    4B/5B编码原理
    PHP在IIS下。
    网络层的一些附属协议
  • 原文地址:https://www.cnblogs.com/linagcheng/p/10831457.html
Copyright © 2011-2022 走看看