zoukankan      html  css  js  c++  java
  • python-自定义异步非阻塞爬虫框架

    api

    import socket
    import select
    
    
    class MySock:
        """Pair a socket-like object with the request info that belongs to it.

        Any attribute that is not found on the wrapper itself (``fileno``,
        ``send``, ``recv``, ``close`` ...) is looked up on the wrapped object,
        so an instance can be used wherever the wrapped object is expected.
        """

        def __init__(self, sock, data):
            # The wrapped socket-like object that attribute access is forwarded to.
            self.sock = sock
            # Request description carried alongside the socket.
            self.data = data

        def __getattr__(self, item):
            """Forward lookups that failed on the wrapper to the wrapped object.

            Raises:
                AttributeError: if the wrapper has no ``sock`` yet, or the
                    wrapped object does not have ``item`` either.
            """
            # __getattr__ only runs after normal lookup failed.  Reading
            # ``self.sock`` here would re-enter __getattr__ forever when the
            # instance was created without __init__ (copy/pickle/__new__), so
            # fetch it from the instance dict directly.
            try:
                wrapped = self.__dict__["sock"]
            except KeyError:
                raise AttributeError(item) from None
            return getattr(wrapped, item)
    
    
    class YinBing:
        """Tiny select()-driven fetcher that issues several HTTP GETs at once.

        Requests are described by dicts with the keys ``host``, ``port``,
        ``path`` and an optional ``callback``.  ``add`` queues a request and
        ``run`` drives all queued requests until each one has produced its
        first chunk of response data.
        """

        def __init__(self):
            # Sockets that are still waiting for response data.
            self.r_list = []
            # Sockets whose request has not been sent yet.
            self.w_list = []

        def add(self, req_info):
            """Open a non-blocking connection for ``req_info`` and queue it.

            Args:
                req_info: dict providing ``host`` and ``port`` for the
                    connection; ``path`` and ``callback`` are read later by
                    ``run``.

            Raises:
                OSError: if the connection attempt fails immediately for a
                    reason other than "still in progress".
            """
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setblocking(False)
            try:
                sock.connect((req_info.get("host"), req_info.get("port")))
            except BlockingIOError:
                # Expected on a non-blocking socket: the connect is still in
                # progress and select() reports the socket writable later.
                pass
            except OSError:
                # A real failure: do not leak the descriptor.
                sock.close()
                raise
            s = MySock(sock, req_info)
            self.r_list.append(s)
            self.w_list.append(s)

        def run(self):
            """Send every queued request and hand each response to its callback.

            Loops until no socket is waiting for a response.  For each socket
            the request is sent once, the first ``recv(1024)`` result is passed
            to the request's ``callback`` (if one is set), and the socket is
            then closed.
            """
            while self.r_list:
                rl, wl, _ = select.select(self.r_list, self.w_list, [], 0.5)

                for sock in wl:
                    request = "GET %s HTTP/1.1\r\nHost:%s\r\n\r\n" % (
                        sock.data["path"],
                        sock.data["host"],
                    )
                    sock.send(request.encode("utf-8"))
                    # Send once only; leaving the socket here would re-send the
                    # request on every pass until the response arrives.
                    self.w_list.remove(sock)

                for sock in rl:
                    try:
                        response = sock.recv(1024)
                        callback = sock.data.get("callback")
                        if callback:
                            callback(response)
                    finally:
                        # The request is finished either way: stop polling the
                        # socket and release its descriptor.
                        self.r_list.remove(sock)
                        if sock in self.w_list:
                            self.w_list.remove(sock)
                        sock.close()
    

    调用方法

    from 自定义爬虫框架 import YinBing
    
    
    def done1(response):
        """Callback: print the response preceded by a fixed label."""
        label = '处理一'
        print(label, response)
    
    
    def done2(response):
        """Callback: print the response on a line of its own."""
        text = str(response)
        print(text)
    
    
    # Requests to issue: target host/port, the path to GET, and the function
    # that receives the response (None means the response is discarded).
    url_list = [
        dict(host='www.baidu.com', port=80, path='/', callback=done1),
        dict(host='www.cnblogs.com', port=80, path='/index.html', callback=done2),
        dict(host='www.bing.com', port=80, path='/', callback=None),
    ]

    if __name__ == '__main__':
        # Queue every request first, then drive them all from a single loop.
        crawler = YinBing()
        for request in url_list:
            crawler.add(request)
        crawler.run()
  • 相关阅读:
    jekyll简单使用
    三、ansible简要使用
    四、ansible主机组定义
    项目中远程连接404 NOT FOUND问题的原因以及解决办法(这里只涉及我遇到的问题)
    AS3中的位操作
    AS3中is和as操作符的区别
    static 函数和普通函数的区别
    [译] SystemTap
    2017-09-17 python 学习笔记
    xargs 命令使用小记
  • 原文地址:https://www.cnblogs.com/LTEF/p/9820955.html
Copyright © 2011-2022 走看看