zoukankan      html  css  js  c++  java
  • 自定义异步IO爬虫

    """
    ##########浏览器的本质#############
    sk=socket.socket(socket.AF_INET,socket.SOCK_STREAM)
    #连接 IO阻塞
    sk.connect(('www.baidu.com',80))
    print('连接成功')
    #连接成功后发送消息 GET请求
    sk.send(b'GET / HTTP/1.0\r\nHost: www.baidu.com\r\n\r\n')
    #POST请求发送数据
    # sk.send(b'POST / HTTP/1.0\r\nHost: www.baidu.com\r\n\r\nk1=v1&k2=v2')
    #等待服务器响应 IO阻塞
    data=sk.recv(8096)
    print(data)
    sk.close()
    """
    #select 实际监听对象的fileno()方法
    #所以对象必须有: fileno()方法,并返回一个文件描述符
    
    #IO多路复用;实际监听多个socket对象
    #异步:-非阻塞的socket+IO多路复用
    import socket
    import select
    
    
    class SocketResponce:
        """Parsed view of a raw HTTP response.

        Attributes:
            data: the raw response bytes exactly as passed in.
            header_dict: header name -> header value (both ``str``).  Values
                are everything after the first ``:`` on the line and are NOT
                stripped, so they normally keep the leading space.
            body: the bytes following the first blank line (``b''`` when the
                payload contains no blank line).
        """

        def __init__(self, data):
            """Store *data* (bytes) and parse it immediately."""
            self.data = data
            self.header_dict = {}
            self.body = None
            self.iniitialize()

        def iniitialize(self):
            """Split ``self.data`` into ``header_dict`` and ``body``.

            The (misspelled) method name is kept because it is part of the
            class's public surface.
            """
            # partition() never raises: a payload without the blank-line
            # separator is treated as "headers only" with an empty body
            # instead of blowing up on tuple unpacking.
            headers, _, body = self.data.partition(b'\r\n\r\n')
            self.body = body
            for raw_line in headers.split(b'\r\n'):
                # A stray non-UTF-8 byte in one header must not abort parsing.
                line = raw_line.decode('utf-8', errors='replace')
                name, colon, value = line.partition(':')
                # Lines without a colon (e.g. the status line) are skipped.
                if colon:
                    self.header_dict[name] = value
    
    
    class SocketRequest:
        """Bundle a socket with the host it targets and a completion callback.

        Instances expose ``fileno()`` so they can be handed to
        ``select.select`` in place of the underlying socket.
        """

        def __init__(self, sk, host, callback):
            """Remember the socket, the host name and the callback."""
            self.socket, self.host, self.callback = sk, host, callback

        def fileno(self):
            """Return the file descriptor of the wrapped socket."""
            descriptor = self.socket.fileno()
            return descriptor
    
    
    class AsyncioRequest:
        """Minimal select()-based HTTP/1.0 fetcher for several hosts at once.

        Attributes:
            conn: requests that are still open (watched for readability).
            connection: requests whose TCP connect has not completed yet
                (watched for writability / exceptional condition).
            errors: ``(host, exception)`` pairs for requests that failed or
                ended without any data; their callback is not invoked unless
                some data had already been received.
        """

        def __init__(self):
            self.conn = []
            self.connection = []  # requests still waiting for connect to finish
            self.errors = []
            self._buffers = {}  # request -> list of received chunks

        def add_request(self, host, callback):
            """Start a non-blocking connect to ``host:80`` and register it.

            ``callback`` is later called with one ``SocketResponce``.

            Raises:
                OSError: if the connect fails immediately (for example the
                    host name cannot be resolved).  The socket is closed
                    before the error propagates.
            """
            sk = socket.socket()
            sk.setblocking(False)
            try:
                # NOTE: host name resolution inside connect() still blocks.
                sk.connect((host, 80))
            except (BlockingIOError, InterruptedError):
                # Expected for a non-blocking socket: connect is in progress
                # and completion is signalled through select() in run().
                pass
            except OSError:
                sk.close()
                raise
            socketreq = SocketRequest(sk, host, callback)
            self.conn.append(socketreq)
            self.connection.append(socketreq)

        def run(self):
            """Drive all registered requests until every one is finished.

            Returns immediately when nothing is registered (select() on empty
            lists without a timeout would otherwise block forever).
            """
            while self.conn:
                rlist, wlist, elist = select.select(
                    self.conn, self.connection, self.connection)

                # Some platforms report a failed connect only as an
                # exceptional condition.
                for failed in elist:
                    if failed in self.connection:
                        self._fail(failed)

                for w in wlist:
                    if w not in self.connection:
                        continue  # already dropped above
                    self.connection.remove(w)
                    # Writable only means the connect attempt finished;
                    # SO_ERROR tells whether it actually succeeded.
                    if w.socket.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR):
                        self.connection.append(w)  # let _fail() clean up uniformly
                        self._fail(w)
                        continue
                    tcp = 'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % (w.host,)
                    try:
                        w.socket.send(bytes(tcp, encoding='utf-8'))
                    except OSError as exc:
                        self._finish(w, exc)

                for r in rlist:
                    if r in self.conn:
                        self._receive(r)

        def _fail(self, req):
            """Drop a request whose connect attempt did not succeed."""
            code = req.socket.getsockopt(socket.SOL_SOCKET, socket.SO_ERROR)
            self._finish(req, OSError(code, 'connect to %s failed' % (req.host,)))

        def _receive(self, req):
            """Read everything currently available for *req* into its buffer."""
            chunks = self._buffers.setdefault(req, [])
            while True:
                try:
                    data = req.socket.recv(8096)
                except (BlockingIOError, InterruptedError):
                    # No more data *for now*; the response is not complete
                    # until the server closes the connection.
                    return
                except OSError as exc:
                    self._finish(req, exc)
                    return
                if not data:  # orderly close by the server: response complete
                    self._finish(req, None)
                    return
                chunks.append(data)

        def _finish(self, req, error):
            """Close *req*, unregister it and deliver whatever was received."""
            data = b''.join(self._buffers.pop(req, []))
            if req in self.connection:
                self.connection.remove(req)
            if req in self.conn:
                self.conn.remove(req)
            # Release the socket before running user code so a raising
            # callback cannot leak it.
            req.socket.close()
            if error is None and not data:
                error = ConnectionError('connection closed without a response')
            if error is not None:
                self.errors.append((req.host, error))
            if data:
                req.callback(SocketResponce(data))
    
    
    def recv_data(responce):
        """Example callback: print the response body, then its header dict."""
        for attribute in ('body', 'header_dict'):
            print(getattr(responce, attribute))
    
    
    # Hosts to fetch; each entry's keys match add_request()'s parameter names.
    url_list = [
        dict(host='www.baidu.com', callback=recv_data),
        dict(host='www.cnblogs.com', callback=recv_data),
    ]

    # Register every host, then drive all requests to completion.
    request = AsyncioRequest()
    for url in url_list:
        request.add_request(**url)

    request.run()
  • 相关阅读:
    java设计模式之单例模式总结
    分页功能实现
    java设计模式之代理模式模式总结
    java设计模式之策略模式总结
    快速排序解决相关问题
    单例模式之恶汉模式(详解)
    java多线程之内存的可见性介绍(备用1)
    Robotframework(4):创建变量的类型和使用
    Robotframework(3):使用pycharm编写和运行RF脚本
    Robotframework(2):创建RF第一条可执行的用例
  • 原文地址:https://www.cnblogs.com/lujiacheng-Python/p/10256073.html
Copyright © 2011-2022 走看看