zoukankan      html  css  js  c++  java
  • python-自定义异步非阻塞爬虫框架

    api

    import socket
    import select
    
    
    class MySock:
        """Pair a socket with the request metadata that belongs to it.

        Any attribute not found on the wrapper itself (``fileno``, ``send``,
        ``recv``, ``close``, ...) is looked up on the wrapped socket, so an
        instance can be handed to ``select.select`` like a plain socket.

        Args:
            sock: The socket object to wrap.
            data: Request information associated with the socket.
        """

        def __init__(self, sock, data):
            self.sock = sock
            self.data = data

        def __getattr__(self, item):
            """Delegate unknown attribute lookups to the wrapped socket.

            Raises:
                AttributeError: If the wrapper has no ``sock`` yet, or the
                    wrapped socket does not provide *item*.
            """
            # __getattr__ only runs when normal lookup fails.  Reading
            # ``self.sock`` here while ``sock`` is unset (e.g. on an instance
            # created via __new__, copy or unpickling) would re-enter this
            # method forever, so go through the instance dict directly.
            try:
                wrapped = self.__dict__["sock"]
            except KeyError:
                raise AttributeError(item) from None
            return getattr(wrapped, item)
    
    
    class YinBing:
        """Tiny select()-driven, non-blocking HTTP GET fetcher.

        Requests are described by dicts with the keys ``host``, ``port``,
        ``path`` and an optional ``callback``.  ``add`` starts the connection,
        ``run`` drives all queued requests until each one has produced a
        response (or failed).
        """

        def __init__(self):
            # Sockets that are still waiting for a response.
            self.r_list = []
            # Sockets whose request has not been sent yet.
            self.w_list = []

        def add(self, req_info):
            """Start a non-blocking connect for *req_info* and queue the socket.

            Args:
                req_info: Mapping with ``host`` and ``port`` (used to connect)
                    plus ``path`` and optionally ``callback`` (used by ``run``).

            Raises:
                OSError: If the connect fails immediately for a reason other
                    than "operation in progress"; the socket is closed first.
            """
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setblocking(False)
            try:
                sock.connect((req_info.get("host"), req_info.get("port")))
            except BlockingIOError:
                # Expected for a non-blocking socket: the connect finishes in
                # the background and select() reports the socket as writable.
                pass
            except OSError:
                # Do not leak the descriptor when the connect cannot start.
                sock.close()
                raise
            s = MySock(sock, req_info)
            self.r_list.append(s)
            self.w_list.append(s)

        def run(self):
            """Send every queued request and dispatch each first response chunk.

            For each socket the request is sent once, up to 1024 bytes of the
            response are read, the request's ``callback`` (if any) is called
            with those bytes, and the socket is closed.  A socket whose send or
            receive fails with ``OSError`` is dropped without a callback so the
            remaining requests can still complete.  Returns once no socket is
            waiting for a response.
            """
            while self.r_list:
                rl, wl, _ = select.select(self.r_list, self.w_list, [], 0.5)
                for sock in wl:
                    # Send each request exactly once: a connected socket stays
                    # writable, so it must leave w_list after the first send.
                    self.w_list.remove(sock)
                    request = "GET %s HTTP/1.1\r\nHost:%s\r\n\r\n" % (
                        sock.data["path"],
                        sock.data["host"],
                    )
                    try:
                        sock.send(request.encode("utf-8"))
                    except OSError:
                        # Connect failed or the peer went away.
                        self._discard(sock)
                for sock in rl:
                    if sock not in self.r_list:
                        # Already dropped above because its send failed.
                        continue
                    try:
                        response = sock.recv(1024)
                    except OSError:
                        self._discard(sock)
                        continue
                    # Release the socket before user code runs so a raising
                    # callback cannot leave it registered and open.
                    self._discard(sock)
                    callback = sock.data.get("callback")
                    if callback:
                        callback(response)

        def _discard(self, sock):
            """Remove *sock* from both wait lists and close it."""
            for pending in (self.r_list, self.w_list):
                if sock in pending:
                    pending.remove(sock)
            sock.close()
    

    调用方法

    from 自定义爬虫框架 import YinBing
    
    
    def done1(response):
        """Response callback: print the response preceded by a fixed label."""
        label = '处理一'
        print(label, response)
    
    
    def done2(response):
        """Response callback: print the response exactly as received."""
        received = response
        print(received)
    
    
    # Requests to fetch: target host/port, the path to GET, and the function
    # (or None) that receives the raw response bytes.
    url_list = [
        dict(host='www.baidu.com', port=80, path='/', callback=done1),
        dict(host='www.cnblogs.com', port=80, path='/index.html', callback=done2),
        dict(host='www.bing.com', port=80, path='/', callback=None),
    ]

    if __name__ == '__main__':
        # Queue every request first, then drive them all from one loop.
        crawler = YinBing()
        for request in url_list:
            crawler.add(request)
        crawler.run()
  • 相关阅读:
    spring cglib final @Transactional
    【转】电商架构
    logback发邮件配置
    @Reference不支持继承
    jmap jstack
    dubbo线程池
    C# 爬虫框架实现 流程_爬虫结构/原理
    C# 爬虫框架实现 流程_各个类开发
    C# 爬虫框架实现 概述
    作用域 作用域链 闭包 思想 JS/C++比较
  • 原文地址:https://www.cnblogs.com/LTEF/p/9820955.html
Copyright © 2011-2022 走看看