zoukankan      html  css  js  c++  java
  • 多线程、多进程、协程、IO多路复用请求百度

    最近学习了多线程、多进程、协程以及IO多路复用。那么对于爬取数据来说,这几种方式哪个最快呢?今天就来简单测试一下。

    普通方式请求百度5次

    import socket
    import time
    import socks 
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # set the default proxy for the socks module (proxy type is socks.HTTP, i.e. an HTTP proxy)
    socket.socket = socks.socksocket  # replace socket.socket so every socket created below is a proxy-aware socksocket
    def blocking(wd):
        '''Fetch the Baidu search page for ``wd`` over a blocking socket.

        Connects to www.baidu.com:80, sends a minimal HTTP/1.0 GET request
        for ``/s?wd=<wd>`` and reads until the peer closes the connection.

        Args:
            wd: Search keyword. It is percent-encoded before being placed
                in the query string.

        Returns:
            bytes: The raw response (status line, headers and body) exactly
            as received.
        '''
        from urllib.parse import quote  # local import keeps this block self-contained

        # Percent-encode the keyword: '+' conventionally means a space in a
        # query string (so 'C++' would not be searched literally), and a raw
        # space would break the request line.
        path = '/s?wd={}'.format(quote(str(wd), safe=''))
        # HTTP needs CRLF line endings and a blank line after the headers.
        request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(path)

        chunks = []  # collected pieces of the response
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))
            sock.sendall(request.encode())  # send() may transmit only part of the buffer
            while True:
                chunk = sock.recv(1024)  # read at most 1024 bytes per call
                if not chunk:  # empty read: the server closed the connection
                    break
                chunks.append(chunk)
        finally:
            sock.close()  # always release the socket, even when the request fails
        return b''.join(chunks)
    
    def blocking_way():
        '''Request each of the five keywords one after another.'''
        for keyword in ('python', 'java', 'C++', 'Ruby', 'Go'):
            blocking(keyword)
    
    if __name__ == '__main__':
        # Time one complete run of the five requests and print the result.
        began = time.time()
        blocking_way()
        elapsed = round(time.time() - began, 2)
        print('请求5次百度总耗时:{}'.format(elapsed))

    多次执行结果:

    请求5次百度总耗时:4.24秒

    多线程版本

    import socket
    import time
    import socks 
    from multiprocessing.pool import ThreadPool
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # set the default proxy for the socks module (proxy type is socks.HTTP, i.e. an HTTP proxy)
    socket.socket = socks.socksocket  # replace socket.socket so every socket created below is a proxy-aware socksocket
    def blocking(wd):
        '''Fetch the Baidu search page for ``wd`` over a blocking socket.

        Connects to www.baidu.com:80, sends a minimal HTTP/1.0 GET request
        for ``/s?wd=<wd>`` and reads until the peer closes the connection.
        Each call uses its own socket, so calls from separate pool threads
        do not share connection state.

        Args:
            wd: Search keyword. It is percent-encoded before being placed
                in the query string.

        Returns:
            bytes: The raw response (status line, headers and body) exactly
            as received.
        '''
        from urllib.parse import quote  # local import keeps this block self-contained

        # Percent-encode the keyword: '+' conventionally means a space in a
        # query string (so 'C++' would not be searched literally), and a raw
        # space would break the request line.
        path = '/s?wd={}'.format(quote(str(wd), safe=''))
        # HTTP needs CRLF line endings and a blank line after the headers.
        request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(path)

        chunks = []  # collected pieces of the response
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))
            sock.sendall(request.encode())  # send() may transmit only part of the buffer
            while True:
                chunk = sock.recv(1024)  # read at most 1024 bytes per call
                if not chunk:  # empty read: the server closed the connection
                    break
                chunks.append(chunk)
        finally:
            sock.close()  # always release the socket, even when the request fails
        return b''.join(chunks)
    
    def blocking_way():
        '''Run the five searches concurrently on a pool of five threads.'''
        keywords = ['python','java','C++','Ruby','Go']
        workers = ThreadPool(5)  # one worker thread per keyword
        for keyword in keywords:
            # Fire-and-forget: the AsyncResult is not kept, so the return
            # value of blocking() is discarded.
            workers.apply_async(blocking, (keyword,))
        workers.close()  # no further tasks will be submitted
        workers.join()  # block until every submitted task has finished
    
    if __name__ == '__main__':
        # Time one complete run of the five requests and print the result.
        began = time.time()
        blocking_way()
        elapsed = round(time.time() - began, 2)
        print('请求5次百度总耗时:{}'.format(elapsed))

    多次执行结果:

    请求5次百度总耗时:1.0秒

    多进程版本

    import socket
    import time
    import socks 
    from multiprocessing import Pool
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # set the default proxy for the socks module (proxy type is socks.HTTP, i.e. an HTTP proxy)
    socket.socket = socks.socksocket  # replace socket.socket so every socket created below is a proxy-aware socksocket
    def blocking(wd):
        '''Fetch the Baidu search page for ``wd`` over a blocking socket.

        Connects to www.baidu.com:80, sends a minimal HTTP/1.0 GET request
        for ``/s?wd=<wd>`` and reads until the peer closes the connection.

        Args:
            wd: Search keyword. It is percent-encoded before being placed
                in the query string.

        Returns:
            bytes: The raw response (status line, headers and body) exactly
            as received.
        '''
        from urllib.parse import quote  # local import keeps this block self-contained

        # Percent-encode the keyword: '+' conventionally means a space in a
        # query string (so 'C++' would not be searched literally), and a raw
        # space would break the request line.
        path = '/s?wd={}'.format(quote(str(wd), safe=''))
        # HTTP needs CRLF line endings and a blank line after the headers.
        request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(path)

        chunks = []  # collected pieces of the response
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))
            sock.sendall(request.encode())  # send() may transmit only part of the buffer
            while True:
                chunk = sock.recv(1024)  # read at most 1024 bytes per call
                if not chunk:  # empty read: the server closed the connection
                    break
                chunks.append(chunk)
        finally:
            sock.close()  # always release the socket, even when the request fails
        return b''.join(chunks)
    
    def blocking_way():
        '''Run the five searches concurrently on a pool of five processes.'''
        keywords = ['python','java','C++','Ruby','Go']
        workers = Pool(5)  # one worker process per keyword
        for keyword in keywords:
            # Fire-and-forget: the AsyncResult is not kept, so the return
            # value of blocking() is discarded.
            workers.apply_async(blocking, (keyword,))
        workers.close()  # no further tasks will be submitted
        workers.join()  # block until every submitted task has finished
    
    if __name__ == '__main__':
        # Time one complete run of the five requests and print the result.
        began = time.time()
        blocking_way()
        elapsed = round(time.time() - began, 2)
        print('请求5次百度总耗时:{}'.format(elapsed))

    多次执行结果:

    请求5次百度总耗时:1.52秒

    协程版本

    from gevent import monkey;monkey.patch_socket()
    import socket
    import time
    import socks 
    import gevent
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # set the default proxy for the socks module (proxy type is socks.HTTP, i.e. an HTTP proxy)
    socket.socket = socks.socksocket  # replace socket.socket so every socket created below is a proxy-aware socksocket
    def blocking(wd):
        '''Fetch the Baidu search page for ``wd`` using blocking-style socket calls.

        Connects to www.baidu.com:80, sends a minimal HTTP/1.0 GET request
        for ``/s?wd=<wd>`` and reads until the peer closes the connection.

        Args:
            wd: Search keyword. It is percent-encoded before being placed
                in the query string.

        Returns:
            bytes: The raw response (status line, headers and body) exactly
            as received.
        '''
        from urllib.parse import quote  # local import keeps this block self-contained

        # Percent-encode the keyword: '+' conventionally means a space in a
        # query string (so 'C++' would not be searched literally), and a raw
        # space would break the request line.
        path = '/s?wd={}'.format(quote(str(wd), safe=''))
        # HTTP needs CRLF line endings and a blank line after the headers.
        request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(path)

        chunks = []  # collected pieces of the response
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))
            sock.sendall(request.encode())  # send() may transmit only part of the buffer
            while True:
                chunk = sock.recv(1024)  # read at most 1024 bytes per call
                if not chunk:  # empty read: the server closed the connection
                    break
                chunks.append(chunk)
        finally:
            sock.close()  # always release the socket, even when the request fails
        return b''.join(chunks)
    
    def blocking_way():
        '''Spawn one greenlet per keyword and wait until all of them finish.'''
        greenlets = []
        for keyword in ['python', 'java', 'C++', 'Ruby', 'Go']:
            greenlets.append(gevent.spawn(blocking, keyword))
        gevent.joinall(greenlets)
    
    if __name__ == '__main__':
        # Time one complete run of the five requests and print the result.
        began = time.time()
        blocking_way()
        elapsed = round(time.time() - began, 2)
        print('请求5次百度总耗时:{}'.format(elapsed))

    多次执行结果:

    请求5次百度总耗时:1.02秒

    IO多路复用版本

    import socks
    import time
    import socket
    import selectors
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80)  # set the default proxy for the socks module (proxy type is socks.HTTP, i.e. an HTTP proxy)
    socket.socket = socks.socksocket  # replace socket.socket so every socket created below is a proxy-aware socksocket
    
    selector = selectors.DefaultSelector()  # event selector shared by every Crawler and by loop()
    flag = True  # event-loop switch: loop() keeps running while this is True
    times = 5  # outstanding-request counter: decremented when a response is complete; at 0 the event loop is stopped
    
    class Crawler():
        '''Callback-driven fetcher for one Baidu search, built on ``selectors``.

        The callbacks register themselves on the module-level ``selector``;
        the event loop invokes whichever one is stored as the key's data.
        Completion is reported through the module-level ``times`` counter
        and ``flag`` switch.

        Attributes:
            response: Raw response bytes accumulated so far.
            wd: Search keyword supplied by the caller.
        '''

        def __init__(self,wd):
            self.response = b''  # filled in piece by piece by get_response
            self.wd = wd  # search keyword

        def _build_request(self):
            '''Return the encoded HTTP/1.0 GET request for ``self.wd``.'''
            from urllib.parse import quote  # local import keeps this block self-contained

            # Percent-encode the keyword: '+' conventionally means a space in
            # a query string (so 'C++' would not be searched literally), and
            # a raw space would break the request line.
            path = '/s?wd={}'.format(quote(str(self.wd), safe=''))
            # HTTP needs CRLF line endings and a blank line after the headers.
            request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(path)
            return request.encode()

        def fetch(self):
            '''Start a non-blocking connect and wait for the socket to become writable.'''
            client = socket.socket()
            client.setblocking(False)
            try:
                client.connect(('www.baidu.com',80))
            except BlockingIOError:
                # Expected: a non-blocking connect returns before the
                # connection is established.
                pass
            # Writable means the connect has finished; send_request runs then.
            selector.register(client,selectors.EVENT_WRITE,self.send_request)

        def send_request(self,client):
            '''Send the GET request, then wait for the socket to become readable.'''
            # Drop the write interest first so the socket can be registered
            # again for a different event.
            selector.unregister(client)
            client.send(self._build_request())
            selector.register(client,selectors.EVENT_READ,self.get_response)

        def get_response(self,client):
            '''Read one chunk per readiness event; finish when the peer closes.

            The event loop calls this again for every readable event, so no
            inner read loop is needed here.
            '''
            global flag
            global times
            data = client.recv(1024)  # at most 1024 bytes per readiness event
            if data:
                self.response += data
                return
            # Empty read: the response is complete. Unregister while the file
            # descriptor is still valid, then close the socket.
            selector.unregister(client)
            client.close()
            times -= 1  # one fewer request outstanding
            if times == 0:  # every request has completed: stop the event loop
                flag = False
    def loop():
        '''Dispatch ready events to their callbacks until ``flag`` is cleared.'''
        while flag:
            # Each key's data holds the callback registered for that socket;
            # it is called with the socket itself.
            for key, _mask in selector.select():
                handler = key.data
                handler(key.fileobj)
    
    if __name__ == '__main__':
        began = time.time()
        # Five keywords: the module-level ``times`` counter starts at 5 and
        # stops the event loop after that many completed responses.
        for keyword in ('python', 'java', 'C++', 'Ruby', 'Go'):
            Crawler(keyword).fetch()
        loop()
        elapsed = round(time.time() - began, 2)
        print('请求5次百度耗时:{}'.format(elapsed))

    多次执行结果:

    请求5次百度耗时:1.17

    大家可以把请求数调多一些多试几次!

    基本上协程和多线程耗时较短,更适用于爬虫。

  • 相关阅读:
    第一周。。。
    新人日报1129
    Daily Report-1126
    How to read source code[repost]
    Markdown tutorial [repost]
    蘑菇街面经
    阿里面经
    百度凤巢一二面经
    Mybatis最入门---代码自动生成(generatorConfig.xml配置)
    Maven的生命周期阶段
  • 原文地址:https://www.cnblogs.com/woaixuexi9999/p/9367020.html
Copyright © 2011-2022 走看看