zoukankan      html  css  js  c++  java
  • 谁在使用GPU?

    nvidia-smi命令可以查看GPU使用情况,但是只能看到占用每个GPU的进程ID。根据进程ID可以得到进程详情,进程详情中包括用户ID,根据用户ID可以获取用户名称,从而知道哪个用户在使用GPU。

    import json
    import os
    import pwd
    import re
    import sys
    import time
    import typing
    
    import bidict
    
    """
    查看谁在使用GPU
    """
    
    
    def get_user_id_map() -> typing.Dict[str, str]:
        """Build a map from user ID to user name.

        Candidate user names are the entry names of the directory that contains
        the current user's home directory. Each candidate is looked up in the
        system password database; names that do not resolve to an account are
        skipped.

        Returns:
            A ``bidict`` whose keys are UIDs (as decimal strings) and whose
            values are the matching user names.
        """
        home = os.path.expanduser('~')
        users = bidict.bidict()
        for user_name in os.listdir(os.path.join(home, '..')):
            try:
                # Query the password database directly instead of shelling out
                # to ``id`` and parsing its text: no shell quoting problems and
                # no regex that has to anticipate every legal user name.
                entry = pwd.getpwnam(user_name)
            except KeyError:
                # The directory entry is not an account name.
                continue
            try:
                users[str(entry.pw_uid)] = entry.pw_name  # userid ==> username
            except Exception as e:
                # Best effort: report an entry the bidict rejects and carry on
                # with the remaining users.
                print(e)
        return users
    
    
    def nvidia_smi(
        smi_output: typing.Optional[str] = None,
    ) -> typing.Tuple[int, typing.Dict[int, typing.List[str]]]:
        """Summarise which processes occupy which GPU according to ``nvidia-smi``.

        Args:
            smi_output: Text of an ``nvidia-smi`` report to parse. When omitted
                (the default) the ``nvidia-smi`` command is executed and its
                standard output is parsed.

        Returns:
            A ``(gpu_count, gpu_usage)`` tuple. ``gpu_usage`` maps a GPU index
            to the list of PIDs (as strings) reported on that GPU; GPUs without
            any process are absent from the mapping.
        """
        if smi_output is None:
            smi_output = os.popen('nvidia-smi').read()
        lines = smi_output.split('\n')

        # The report has two parts: an upper table with the utilisation of each
        # GPU and a lower table listing the processes running on each GPU.
        # The upper table ends at the first blank line and its per-GPU rows
        # start right after the first '|====' separator.
        space_ind = next(
            (ind for ind, line in enumerate(lines) if not line.strip()), 0)
        first_line = next(
            (ind for ind, line in enumerate(lines) if line.startswith('|====')), 0)
        # NOTE(review): assumes every GPU occupies three text lines in the
        # upper table -- confirm against the driver version in use.
        gpu_count = abs(space_ind - first_line) // 3

        pos = None
        for ind, line in enumerate(lines):
            tokens = line.split()
            if len(tokens) > 1 and tokens[1] == 'Processes:':
                pos = ind + 1
                break

        gpu_usage = dict()
        if pos is None:
            return gpu_count, gpu_usage

        # Column of the PID inside a whitespace-split row. Token 0 is the
        # leading '|', token 1 the GPU index. The column header row is used to
        # locate 'PID' because its position is not the same in every report
        # layout; 2 is the fallback when no header row is found.
        pid_col = 2
        for line in lines[pos:]:
            tokens = line.split()
            if len(tokens) > 1 and tokens[1] == 'GPU' and 'PID' in tokens:
                pid_col = tokens.index('PID')
                continue
            if len(tokens) <= pid_col:
                continue
            # Skip everything that is not a process row: separators, extra
            # header rows and the "No running processes found" message.
            if not (tokens[1].isdigit() and tokens[pid_col].isdigit()):
                continue
            gpu_usage.setdefault(int(tokens[1]), []).append(tokens[pid_col])
        return gpu_count, gpu_usage
    
    
    def get_thread_info(thread_id: str):
        """Describe the process with the given PID using ``ps -l``.

        Args:
            thread_id: PID of the process, as a string.

        Returns:
            A dict with the keys ``user`` (name looked up from the UID column,
            ``None`` when unknown), ``use_time`` (the leading number of the
            TIME column divided by 60, formatted as ``"<n> hours"``),
            ``thread_id`` (the PID that was passed in) and ``cmd`` (the command
            line). When ``ps`` prints no row for the PID, ``user`` and
            ``use_time`` are ``None`` and ``cmd`` is empty.
        """
        ps_lines = os.popen('ps -l ' + thread_id).read().split('\n')
        # Line 0 is the column header, line 1 the row of the process itself.
        thread_info = ps_lines[1].split() if len(ps_lines) > 1 else []
        if len(thread_info) < 14:
            # No usable row, e.g. the process exited after nvidia-smi listed it.
            return dict(user=None, use_time=None, thread_id=thread_id, cmd='')
        id2user = get_user_id_map()
        # NOTE(review): the indices assume the column layout
        # F S UID PID PPID C PRI NI ADDR SZ WCHAN TTY TIME CMD -- confirm for
        # the ps implementation in use.
        thread_user = id2user.get(thread_info[2])
        thread_time = re.search(r'\d+', thread_info[12]).group()
        thread_cmd = ' '.join(thread_info[13:])
        return dict(
            user=thread_user,
            use_time="{} hours".format(float(thread_time) / 60),
            thread_id=thread_id,
            cmd=thread_cmd,
        )
    
    
    def grep_gpu(task):
        """Wait for a GPU with no process on it, then run ``task`` on that GPU.

        ``nvidia_smi()`` is polled every two seconds until some GPU index is
        missing from the usage map. The index of that GPU is exported through
        ``CUDA_VISIBLE_DEVICES`` (the same mechanism ``run`` uses) before the
        command is started, so the task actually lands on the GPU that was
        found free.

        Args:
            task: Shell command line to execute once a GPU is free.
        """
        while True:
            gpu_count, usage = nvidia_smi()
            free_gpu = next((i for i in range(gpu_count) if i not in usage), None)
            if free_gpu is not None:
                break
            # Only wait when every GPU is busy; start immediately otherwise so
            # the usage snapshot is still fresh when the task launches.
            time.sleep(2)
        print('free gpu found ! ', free_gpu)
        os.environ['CUDA_VISIBLE_DEVICES'] = str(free_gpu)
        os.system(task)
    
    
    def show():
        """Print the GPU count followed by a JSON report of the processes per GPU."""
        total, pids_by_gpu = nvidia_smi()
        report = {}
        for gpu, pids in pids_by_gpu.items():
            # Replace every PID with the detailed description of its process.
            report[gpu] = list(map(get_thread_info, pids))
        print('gpu count', total)
        print(json.dumps(report, ensure_ascii=False, indent=2))
    
    
    def run(gpu_id, task):
        """Run the shell command ``task`` with ``CUDA_VISIBLE_DEVICES`` set to ``gpu_id``."""
        os.environ.update(CUDA_VISIBLE_DEVICES=str(gpu_id))
        # Echo the variable first so the chosen device shows up in the output,
        # then start the actual command.
        for command in ('echo CUDA_VISIBLE_DEVICES:$CUDA_VISIBLE_DEVICES', task):
            os.system(command)
    
    
    if __name__ == '__main__':
        # Command line entry point: "show", "grep <command>" or "<gpu id> <command>".
        print(sys.argv)
        if len(sys.argv) == 1:
            print("""
            GPU utility
            
            gpu show
            gpu grep your command here
            gpu 1 python haha.py
            """)
            # sys.exit instead of the site-provided exit(), which is not
            # guaranteed to exist (e.g. under ``python -S``).
            sys.exit(0)
        action = sys.argv[1]
        if action == 'show':  # display the current GPU usage
            show()
        elif action == 'grep':  # wait for a free GPU, then run the command
            cmd = ' '.join(sys.argv[2:])
            print('grep gpu and run', cmd)
            grep_gpu(cmd)
        elif re.fullmatch(r"\d+", action):  # run the command on the given GPU
            # fullmatch: a prefix match would let "1abc" through and crash in
            # int() below.
            gpu_id = int(action)
            cmd = ' '.join(sys.argv[2:])
            print('run on gpu', gpu_id, 'cmd', cmd)
            run(gpu_id, cmd)
        else:
            print("unkown command")
    
    
  • 相关阅读:
    DataTable 导出到Excel
    asp.net 连接新浪微博
    ASP.NET中的HTTP模块和处理程序
    asp.net 前台获得url参数的最简单方法
    将Excel导入到DataTable (用ODBC方法连接)
    下拉框控件dhtmlXCombo在ASP.NET中的使用详解
    iis站点 asp.net网站访问弹出提示框
    习惯的力量
    1.面向对象设计模式与原则
    5. Factory Method 工厂方法(创建型模式)
  • 原文地址:https://www.cnblogs.com/weiyinfu/p/11087363.html
Copyright © 2011-2022 走看看