zoukankan      html  css  js  c++  java
  • 多线程、多进程、协程、IO多路复用请求百度

    最近学习了多线程、多进程、协程以及IO多路复用,那么对于爬取数据来说,这几个方式哪个最快呢,今天就来稍微测试一下

    普通方式请求百度5次

    import socket
    import time
    import socks 
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # route all traffic through an HTTP proxy (PySocks)
    socket.socket = socks.socksocket  # monkey-patch socket so every new connection goes through the proxy
    def blocking(wd):
        """Fetch a Baidu search-results page for keyword *wd* over a raw socket.

        Returns the complete HTTP response (status line + headers + body)
        as bytes. Blocks until the server closes the connection.
        """
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))  # connect to Baidu, port 80
            # HTTP/1.0 without keep-alive: the server closes the connection
            # after the response, so recv() returning b'' marks end-of-response.
            # The \r\n line endings were mangled in the original source.
            request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(
                '/s?wd={}'.format(wd))
            sock.sendall(request.encode())  # sendall: send() may write only part
            chunks = []
            chunk = sock.recv(1024)  # read up to 1024 bytes per call
            while chunk:  # loop until EOF (empty chunk)
                chunks.append(chunk)
                chunk = sock.recv(1024)
            return b''.join(chunks)  # single join instead of quadratic bytes +=
        finally:
            sock.close()  # original leaked the socket
    
    def blocking_way():
        """Run the five Baidu searches one after another (sequential baseline)."""
        for keyword in ('python', 'java', 'C++', 'Ruby', 'Go'):
            blocking(keyword)
    
    if __name__ == '__main__':
        start_time = time.time()
        blocking_way()  # run the 5 requests sequentially
        # message: "total time for 5 requests to Baidu"
        print('请求5次百度总耗时:{}'.format(round(time.time()-start_time,2)))

    多次执行结果:

    请求5次百度总耗时:4.24秒

    多线程版本

    import socket
    import time
    import socks 
    from multiprocessing.pool import ThreadPool
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # route all traffic through an HTTP proxy (PySocks)
    socket.socket = socks.socksocket  # monkey-patch socket so every new connection goes through the proxy
    def blocking(wd):
        """Fetch a Baidu search-results page for keyword *wd* over a raw socket.

        Returns the complete HTTP response (status line + headers + body)
        as bytes. Blocks until the server closes the connection.
        """
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))  # connect to Baidu, port 80
            # HTTP/1.0 without keep-alive: the server closes the connection
            # after the response, so recv() returning b'' marks end-of-response.
            # The \r\n line endings were mangled in the original source.
            request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(
                '/s?wd={}'.format(wd))
            sock.sendall(request.encode())  # sendall: send() may write only part
            chunks = []
            chunk = sock.recv(1024)  # read up to 1024 bytes per call
            while chunk:  # loop until EOF (empty chunk)
                chunks.append(chunk)
                chunk = sock.recv(1024)
            return b''.join(chunks)  # single join instead of quadratic bytes +=
        finally:
            sock.close()  # original leaked the socket
    
    def blocking_way():
        """Fan the five searches out to a pool of five worker threads."""
        keywords = ('python', 'java', 'C++', 'Ruby', 'Go')
        pool = ThreadPool(5)  # one thread per keyword
        for kw in keywords:
            pool.apply_async(blocking, args=(kw,))  # submit task to the pool
        pool.close()  # stop accepting new tasks
        pool.join()   # block until all submitted tasks finish
    
    if __name__ == '__main__':
        start_time = time.time()
        blocking_way()  # run the 5 requests on a thread pool
        # message: "total time for 5 requests to Baidu"
        print('请求5次百度总耗时:{}'.format(round(time.time()-start_time,2)))

    多次执行结果:

    请求5次百度总耗时:1.0秒

    多进程版本

    import socket
    import time
    import socks 
    from multiprocessing import Pool
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # route all traffic through an HTTP proxy (PySocks)
    socket.socket = socks.socksocket  # monkey-patch socket so every new connection goes through the proxy
    def blocking(wd):
        """Fetch a Baidu search-results page for keyword *wd* over a raw socket.

        Returns the complete HTTP response (status line + headers + body)
        as bytes. Blocks until the server closes the connection.
        """
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))  # connect to Baidu, port 80
            # HTTP/1.0 without keep-alive: the server closes the connection
            # after the response, so recv() returning b'' marks end-of-response.
            # The \r\n line endings were mangled in the original source.
            request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(
                '/s?wd={}'.format(wd))
            sock.sendall(request.encode())  # sendall: send() may write only part
            chunks = []
            chunk = sock.recv(1024)  # read up to 1024 bytes per call
            while chunk:  # loop until EOF (empty chunk)
                chunks.append(chunk)
                chunk = sock.recv(1024)
            return b''.join(chunks)  # single join instead of quadratic bytes +=
        finally:
            sock.close()  # original leaked the socket
    
    def blocking_way():
        """Distribute the five searches across a pool of five worker processes."""
        keywords = ('python', 'java', 'C++', 'Ruby', 'Go')
        pool = Pool(5)  # one process per keyword
        for kw in keywords:
            pool.apply_async(blocking, args=(kw,))  # submit task to the pool
        pool.close()  # no more task submissions
        pool.join()   # wait for all workers to finish
    
    if __name__ == '__main__':
        start_time = time.time()
        blocking_way()  # run the 5 requests on a process pool
        # message: "total time for 5 requests to Baidu"
        print('请求5次百度总耗时:{}'.format(round(time.time()-start_time,2)))

    多次执行结果:

    请求5次百度总耗时:1.52秒

    协程版本

    from gevent import monkey;monkey.patch_socket()
    import socket
    import time
    import socks 
    import gevent
    
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80) # route all traffic through an HTTP proxy (PySocks)
    socket.socket = socks.socksocket  # monkey-patch socket so every new connection goes through the proxy
    def blocking(wd):
        """Fetch a Baidu search-results page for keyword *wd* over a raw socket.

        With gevent's monkey.patch_socket() applied at import time, the
        blocking calls below cooperatively yield to other greenlets.
        Returns the complete HTTP response as bytes.
        """
        sock = socket.socket()
        try:
            sock.connect(('www.baidu.com', 80))  # connect to Baidu, port 80
            # HTTP/1.0 without keep-alive: the server closes the connection
            # after the response, so recv() returning b'' marks end-of-response.
            # The \r\n line endings were mangled in the original source.
            request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(
                '/s?wd={}'.format(wd))
            sock.sendall(request.encode())  # sendall: send() may write only part
            chunks = []
            chunk = sock.recv(1024)  # read up to 1024 bytes per call
            while chunk:  # loop until EOF (empty chunk)
                chunks.append(chunk)
                chunk = sock.recv(1024)
            return b''.join(chunks)  # single join instead of quadratic bytes +=
        finally:
            sock.close()  # original leaked the socket
    
    def blocking_way():
        """Spawn one greenlet per search keyword and wait for all of them."""
        keywords = ['python', 'java', 'C++', 'Ruby', 'Go']
        gevent.joinall([gevent.spawn(blocking, kw) for kw in keywords])
    
    if __name__ == '__main__':
        start_time = time.time()
        blocking_way()  # run the 5 requests as cooperating greenlets
        # message: "total time for 5 requests to Baidu"
        print('请求5次百度总耗时:{}'.format(round(time.time()-start_time,2)))

    多次执行结果:

    请求5次百度总耗时:1.02秒

    IO多路复用版本

    import socks
    import time
    import socket
    import selectors
    socks.set_default_proxy(socks.HTTP,addr='192.168.105.71',port=80)  # route all traffic through an HTTP proxy (PySocks)
    socket.socket = socks.socksocket  # monkey-patch socket so every new connection goes through the proxy
    
    selector = selectors.DefaultSelector()  # platform-appropriate I/O multiplexer
    flag = True  # event loop keeps running while True
    times = 5  # outstanding-request counter; loop() stops when it hits 0
    
    class Crawler():
        """Non-blocking crawler: one instance per search keyword.

        Registers its socket with the module-level ``selector``; the request
        is driven forward by callbacks fired from ``loop()``.
        """
        def __init__(self, wd):
            self.response = b''  # accumulates the raw HTTP response bytes
            self.wd = wd  # search keyword

        def fetch(self):
            """Start a non-blocking connect and watch for writability."""
            client = socket.socket()
            client.setblocking(False)
            try:
                client.connect(('www.baidu.com', 80))
            except BlockingIOError:
                pass  # expected: non-blocking connect is still in progress
            # the socket becomes writable once the connect completes
            selector.register(client, selectors.EVENT_WRITE, self.send_request)

        def send_request(self, client):
            """Connect finished: send the HTTP request, then wait for the reply."""
            selector.unregister(client)  # swap the watched event from write to read
            # HTTP/1.0 request; the \r\n line endings were mangled in the
            # original source, which made the literal a syntax error.
            request = 'GET {} HTTP/1.0\r\nHost:www.baidu.com\r\n\r\n'.format(
                '/s?wd={}'.format(self.wd))
            client.sendall(request.encode())  # sendall: send() may write only part
            selector.register(client, selectors.EVENT_READ, self.get_response)

        def get_response(self, client):
            """Read one chunk per readiness event; on EOF finish this request."""
            global flag
            global times
            data = client.recv(1024)  # fired once per event, so no while loop
            if data:
                self.response += data
            else:  # peer closed: the whole HTTP/1.0 response has arrived
                # unregister BEFORE closing: a closed socket has fileno() == -1,
                # which the original close-then-unregister order relied on not
                # being checked by the selector implementation
                selector.unregister(client)
                client.close()
                times -= 1  # one request finished
                if times == 0:  # all five done: stop the event loop
                    flag = False
    def loop():
        """Dispatch selector events until the last response has been read."""
        while flag:
            for key, _mask in selector.select():
                # key.data holds the callback registered in fetch/send_request
                key.data(key.fileobj)
    
    if __name__ == '__main__':
        start_time = time.time()
        search_list = ['python', 'java', 'C++', 'Ruby', 'Go']
        for item in search_list:
            crawler = Crawler(item)
            crawler.fetch()  # kick off all five non-blocking requests first
        loop()  # then drive them to completion via the event loop
        # message: "time for 5 requests to Baidu"
        print('请求5次百度耗时:{}'.format(round(time.time()-start_time,2)))

    多次执行结果:

    请求5次百度耗时:1.17秒

    大家可以把请求数调多一些多试几次!

    基本上协程和多线程耗时较短,更适用于爬虫。

  • 相关阅读:
    HDU 4472 Count DP题
    HDU 1878 欧拉回路 图论
    CSUST 1503 ZZ买衣服
    HDU 2085 核反应堆
    HDU 1029 Ignatius and the Princess IV
    UVa 11462 Age Sort
    UVa 11384
    UVa 11210
    LA 3401
    解决学一会儿累了的问题
  • 原文地址:https://www.cnblogs.com/woaixuexi9999/p/9367020.html
Copyright © 2011-2022 走看看