zoukankan      html  css  js  c++  java
  • NVML查询显卡信息

            前段时间做英伟达硬解的时候,显卡总是莫名挂掉,后来发现是因为显卡温度过高而挂掉的。这几天发现CUDA中有NVML工具包可以查看显卡信息,nvidia-smi也是基于这个工具包实现的。

            使用的CUDA版本为CUDA 8.0 。

    1.给程序添加NVML

            安装CUDA之后可以找到如下:

    image

    图1.NVML的例子

    这里面包含的是NVML的一个例子。我的系统是64位的,可以找到NVML的lib和头文件如下:

    image

    图2.NVML的lib文件

    image

    图3.NVML头文件

    在工程中包含NVML。我是新建的CUDA 8.0 Runtime工程,因为NVML包含在CUDA中,建CUDA 8.0 Runtime工程可以省去CUDA的配置工作,工程建立方法参见VS2013 VC++的.cpp文件调用CUDA的.cu文件中的函数

    ,CUDA 8.0为默认安装,系统为win10 64位。

            在程序中直接包含NVML的头文件和lib文件即可:

    #include "nvml.h"
    
    #pragma  comment(lib,"nvml.lib")

    注意64位系统应该建立x64工程,因为在安装的CUDA中没有win32的nvml.lib。

    2.NVML查询显卡信息

            常用函数:

            ·nvmlInit()函数初始化NVML;

            ·nvmlDeviceGetCount(unsigned int *deviceCount)函数可以获得显卡数;

            ·nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device)获取设备;

            ·nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length)查询设备的名称;

            ·nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci)获取PCI信息,对这个函数的重要性,例子中是这么说的

                // pci.busId is very useful to know which device physically you're talking to
                // Using PCI identifier you can also match nvmlDevice handle to CUDA device.

            ·nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode)得到显卡当前所处的模式,模式有以下几种:

    /* Compute modes a device can be in, as quoted from the NVML header. */
    typedef enum nvmlComputeMode_enum
    {
        /** Default compute mode -- multiple contexts per device. */
        NVML_COMPUTEMODE_DEFAULT = 0,
        /** Support removed. */
        NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1,
        /** Compute-prohibited mode -- no contexts per device. */
        NVML_COMPUTEMODE_PROHIBITED = 2,
        /** Compute-exclusive-process mode -- only one context per device,
         *  usable from multiple threads at a time. */
        NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3,

        /* Number of enumerators above; must stay the final entry. */
        NVML_COMPUTEMODE_COUNT
    } nvmlComputeMode_t;

            ·nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode)可以修改显卡的模式;

            ·nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp)查询温度阈值,具体有两种:

    /* Selects which temperature threshold is queried. */
    typedef enum nvmlTemperatureThresholds_enum
    {
        /* Temperature at which the GPU will shut down for HW protection. */
        NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0,
        /* Temperature at which the GPU will begin slowdown. */
        NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1,

        /* Number of enumerators above; must stay the final entry. */
        NVML_TEMPERATURE_THRESHOLD_COUNT
    } nvmlTemperatureThresholds_t;

    当温度达到NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 参数获取的温度时,显卡将自动关闭以保护硬件;当温度达到NVML_TEMPERATURE_THRESHOLD_SLOWDOWN参数获取的温度时,显卡的性能将下降。

            ·nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp)获取显卡当前温度;

            ·nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization)获取设备的使用率(原注释:Retrieves the current utilization rates for the device's major subsystems。不知道理解错了没有),使用率包括以下:

    /* Utilization figures reported for a device; both fields are percentages. */
    typedef struct nvmlUtilization_st
    {
        /** Percent of time over the past sample period during which one or
         *  more kernels was executing on the GPU. */
        unsigned int gpu;
        /** Percent of time over the past sample period during which global
         *  (device) memory was being read or written. */
        unsigned int memory;
    } nvmlUtilization_t;

            ·nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory)    Retrieves the amount of used, free and total memory available on the device, in bytes。

            ·nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory)   Gets Total, Available and Used size of BAR1 memory.(不知道这种与上一种有什么区别,有待后续学习)

           ·nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos)    Get information about processes with a compute context on a device。应该是获取当前在使用显卡的程序信息。

            ·nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock)   Retrieves the maximum clock speeds for the device。包括以下:

    /* Clock domains that can be queried on a device. */
    typedef enum nvmlClockType_enum
    {
        /** Graphics clock domain. */
        NVML_CLOCK_GRAPHICS = 0,
        /** SM clock domain. */
        NVML_CLOCK_SM = 1,
        /** Memory clock domain. */
        NVML_CLOCK_MEM = 2,
        /** Video encoder/decoder clock domain. */
        NVML_CLOCK_VIDEO = 3,

        /** Count of clock types; must stay the final entry. */
        NVML_CLOCK_COUNT
    } nvmlClockType_t;

            ·nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock)   Retrieves the current clock speeds for the device.上面是获取最大的,这个是获取当前的。

    代码示例:

    #include "cuda_kernels.h"
    
    #include "nvml.h"
    
    #include <stdio.h>  
    #include <stdlib.h>
    #include <windows.h>  
    #include <winbase.h>  
    #include <tlhelp32.h>  
    #include <psapi.h>
    
    #pragma comment(lib,"kernel32.lib")  
    #pragma comment(lib,"advapi32.lib")  
    
    #pragma  comment(lib,"nvml.lib")
    
    /*
     * Returns a printable label for an NVML compute mode.
     *
     * The result always points at a string literal, so the caller must not
     * modify or free it. Any value that is not one of the four named modes
     * yields "Unknown".
     */
    const char * convertToComputeModeString(nvmlComputeMode_t mode)
    {
        if (mode == NVML_COMPUTEMODE_DEFAULT)
            return "Default";

        if (mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD)
            return "Exclusive_Thread";

        if (mode == NVML_COMPUTEMODE_PROHIBITED)
            return "Prohibited";

        if (mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS)
            return "Exclusive Process";

        return "Unknown";
    }
    
    int main()
    {
        cuAdd();
    
        nvmlReturn_t result;
        unsigned int device_count, i;
    
        // First initialize NVML library
        result = nvmlInit();
        if (NVML_SUCCESS != result)
        {
            printf("Failed to initialize NVML: %s
    ", nvmlErrorString(result));
    
            printf("Press ENTER to continue...
    ");
            getchar();
            return 1;
        }
    
        result = nvmlDeviceGetCount(&device_count);
        if (NVML_SUCCESS != result)
        {
            printf("Failed to query device count: %s
    ", nvmlErrorString(result));
            goto Error;
        }
        printf("Found %d device%s
    
    ", device_count, device_count != 1 ? "s" : "");
    
        printf("Listing devices:
    ");
        while (true)
        {
            for (i = 0; i < device_count; i++)
            {
                nvmlDevice_t device;
                char name[NVML_DEVICE_NAME_BUFFER_SIZE];
                nvmlPciInfo_t pci;
                nvmlComputeMode_t compute_mode;
    
                // Query for device handle to perform operations on a device
                // You can also query device handle by other features like:
                // nvmlDeviceGetHandleBySerial
                // nvmlDeviceGetHandleByPciBusId
                result = nvmlDeviceGetHandleByIndex(i, &device);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get handle for device %i: %s
    ", i, nvmlErrorString(result));
                    goto Error;
                }
    
                result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get name of device %i: %s
    ", i, nvmlErrorString(result));
                    goto Error;
                }
    
                // pci.busId is very useful to know which device physically you're talking to
                // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
                result = nvmlDeviceGetPciInfo(device, &pci);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get pci info for device %i: %s
    ", i, nvmlErrorString(result));
                    goto Error;
                }
    
                printf("%d. %s [%s]
    ", i, name, pci.busId);
    
                // This is a simple example on how you can modify GPU's state
                result = nvmlDeviceGetComputeMode(device, &compute_mode);
                if (NVML_ERROR_NOT_SUPPORTED == result)
                    printf("	 This is not CUDA capable device
    ");
                else if (NVML_SUCCESS != result)
                {
                    printf("Failed to get compute mode for device %i: %s
    ", i, nvmlErrorString(result));
                    goto Error;
                }
                else
                {
                    // try to change compute mode
                    printf("	 Changing device's compute mode from '%s' to '%s'
    ",
                        convertToComputeModeString(compute_mode),
                        convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));
    
                    result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
                    if (NVML_ERROR_NO_PERMISSION == result)
                        printf("		 Need root privileges to do that: %s
    ", nvmlErrorString(result));
                    else if (NVML_ERROR_NOT_SUPPORTED == result)
                        printf("		 Compute mode prohibited not supported. You might be running on
    "
                        "		 windows in WDDM driver model or on non-CUDA capable GPU.
    ");
                    else if (NVML_SUCCESS != result)
                    {
                        printf("		 Failed to set compute mode for device %i: %s
    ", i, nvmlErrorString(result));
                        goto Error;
                    }
                    else
                    {
                        printf("	 Restoring device's compute mode back to '%s'
    ",
                            convertToComputeModeString(compute_mode));
                        result = nvmlDeviceSetComputeMode(device, compute_mode);
                        if (NVML_SUCCESS != result)
                        {
                            printf("		 Failed to restore compute mode for device %i: %s
    ", i, nvmlErrorString(result));
                            goto Error;
                        }
                    }
                }
    
                printf("
    ");
                printf("----- 温度 ----- 
    ");
                unsigned int temperature_threshold = 100;
                result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &temperature_threshold);
                if (NVML_SUCCESS != result)
                {
                    printf("device %i Failed to get NVML_TEMPERATURE_THRESHOLD_SHUTDOWN: %s
    ", i, nvmlErrorString(result));
                }
                else
                    printf("截止温度: %d 摄氏度  (Temperature at which the GPU will shut down for HW protection)
    ", temperature_threshold);
    
                result = nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &temperature_threshold);
                if (NVML_SUCCESS != result)
                {
                    printf("device %i Failed NVML_TEMPERATURE_THRESHOLD_SLOWDOWN: %s
    ", i, nvmlErrorString(result));
                }
                else
                    printf("上限温度: %d 摄氏度  (Temperature at which the GPU will begin slowdown)
    ", temperature_threshold);
    
                unsigned int temperature = 0;
                result = nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temperature);
                if (NVML_SUCCESS != result)
                {
                    printf("device %i NVML_TEMPERATURE_GPU Failed: %s
    ", i, nvmlErrorString(result));
                }
                else
                    printf("当前温度: %d 摄氏度 
    ", temperature);
    
                //使用率
                printf("
    ");
                nvmlUtilization_t utilization;
                result = nvmlDeviceGetUtilizationRates(device, &utilization);
                if (NVML_SUCCESS != result)
                {
                    printf(" device %i nvmlDeviceGetUtilizationRates Failed : %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("----- 使用率 ----- 
    ");
                    printf("GPU 使用率: %lld %% 
    ", utilization.gpu);
                    printf("显存使用率: %lld %% 
    ", utilization.memory);
                }
    
                //FB memory
                printf("
    ");
                nvmlMemory_t memory;
                result = nvmlDeviceGetMemoryInfo(device, &memory);
                if (NVML_SUCCESS != result)
                {
                    printf("device %i nvmlDeviceGetMemoryInfo Failed : %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("------ FB memory ------- 
    ");
                    printf("Total installed FB memory: %lld bytes 
    ", memory.total);
                    printf("Unallocated FB memory: %lld bytes 
    ", memory.free);
                    printf("Allocated FB memory: %lld bytes 
    ", memory.used);
                }
    
                //BAR1 memory
                printf("
    ");
                nvmlBAR1Memory_t bar1Memory;
                result = nvmlDeviceGetBAR1MemoryInfo(device, &bar1Memory);
                if (NVML_SUCCESS != result)
                {
                    printf("device %i  nvmlDeviceGetBAR1MemoryInfo Failed : %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("------ BAR1 memory ------- 
    ");
                    printf("Total BAR1 memory: %lld bytes 
    ", bar1Memory.bar1Total);
                    printf("Unallocated BAR1 memory: %lld bytes 
    ", bar1Memory.bar1Free);
                    printf("Allocated BAR1 memory: %lld bytes 
    ", bar1Memory.bar1Used);
                }
    
                //Information about running compute processes on the GPU
                printf("
    ");
                unsigned int infoCount;
                nvmlProcessInfo_t infos[999];
                result = nvmlDeviceGetComputeRunningProcesses(device, &infoCount, infos);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get ComputeRunningProcesses for device %i: %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    HANDLE handle; //定义CreateToolhelp32Snapshot系统快照句柄       
                    handle = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0);//获得系统快照句柄     
                    PROCESSENTRY32 *info; //定义PROCESSENTRY32结构字指     
                    //PROCESSENTRY32 结构的 dwSize 成员设置成 sizeof(PROCESSENTRY32)      
                    info = new PROCESSENTRY32;
                    info->dwSize = sizeof(PROCESSENTRY32);
                    //调用一次     Process32First 函数,从快照中获取进程列表     
                    Process32First(handle, info);
                    //重复调用 Process32Next,直到函数返回 FALSE 为止    
    
                    printf("------ Information about running compute processes on the GPU ------- 
    ");
                    for (int i = 0; i < infoCount; i++)
                    {
                        printf("PID: %d  显存占用:%lld bytes   ", infos[i].pid, infos[i].usedGpuMemory);
    
                        while (Process32Next(handle, info) != FALSE)
                        {
                            if (info->th32ProcessID == infos[i].pid)
                            {
                                //printf("  %s
    ", info->szExeFile);
    
                                HANDLE hProcess = NULL;
                                //打开目标进程  
                                hProcess = OpenProcess(PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, info->th32ProcessID);
                                if (hProcess == NULL) {
                                    printf("
    Open Process fAiled:%d
    ", GetLastError());
                                    break;
                                }
    
                                char strFilePath[MAX_PATH];
                                GetModuleFileNameEx(hProcess, NULL, strFilePath, MAX_PATH);
                                printf(" %s
    ", strFilePath);
    
                                CloseHandle(hProcess);
    
                                break;
                            }
                        }
                    }
                    
                    delete info;
                    CloseHandle(handle);
                }
    
                //BAR1 memory
                printf("
    ");
                printf("------ Clocks ------- 
    "); 
                unsigned int max_clock;
                result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_GRAPHICS, &max_clock);
                if (NVML_SUCCESS != result)
                {
                    printf("device %i   nvmlDeviceGetMaxClockInfo Failed : %s
    ", i, nvmlErrorString(result));
                }
    
                unsigned int clock;
                result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_GRAPHICS, &clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get NVML_CLOCK_GRAPHICS info for device %i: %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("GRAPHICS: %6d Mhz   max clock :%d  
    ", clock, max_clock);
                }
    
                result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &max_clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get max NVML_CLOCK_SM for device %i: %s
    ", i, nvmlErrorString(result));
                }
    
                result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get current NVML_CLOCK_SM for device %i: %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("      SM: %6d Mhz   max clock :%d   
    ", clock, max_clock);
                }
    
                result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_MEM, &max_clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get max NVML_CLOCK_MEM for device %i: %s
    ", i, nvmlErrorString(result));
                }
    
                result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get current NVML_CLOCK_MEM for device %i: %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("     MEM: %6d Mhz   max clock :%d   
    ", clock, max_clock);
                }
    
                result = nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_VIDEO, &max_clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get max NVML_CLOCK_VIDEO for device %i: %s
    ", i, nvmlErrorString(result));
                }
    
                result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_VIDEO, &clock);
                if (NVML_SUCCESS != result)
                {
                    printf("Failed to get current NVML_CLOCK_VIDEO for device %i: %s
    ", i, nvmlErrorString(result));
                }
                else
                {
                    printf("   VIDEO: %6d Mhz   max clock :%d   
    ", clock, max_clock);
                }
            }
    
            printf("-------------------------------------------------------------------- 
    ");
    
            Sleep(1000);
        }
    
    Error:
        result = nvmlShutdown();
        if (NVML_SUCCESS != result)
            printf("Failed to shutdown NVML: %s
    ", nvmlErrorString(result));
    
        system("pause");
    
        return 0;
    }

    我已经把nvml.dll拷贝到运行目录,程序应该可以正常运行了。不过还是顺便做一下nvidia-smi的环境配置,参考《NVIDIA 显卡信息(CUDA信息的查看)》一文,我把其中的内容复制到下面:

    1. nvidia-smi 查看显卡信息

    nvidia-smi 指的是 NVIDIA System Management Interface;

    在安装完成 NVIDIA 显卡驱动之后,对于 windows 用户而言,cmd 命令行界面还无法识别 nvidia-smi 命令,需要将相关环境变量添加进去。如将 NVIDIA 显卡驱动安装在默认位置,nvidia-smi 命令所在的完整路径应当为:

    C:\Program Files\NVIDIA Corporation\NVSMI

    也即将上述路径添加进 Path 系统环境变量中。

    2. 查看 CUDA 信息

    • CUDA 的版本:
      • 进入命令行:nvcc -V

    3.运行结果

    image

    图4.GeForce 940M查询结果

    P4测试

    图5.Tesla P4查询结果

            NVML对GeForce 940M的支持不怎么好,对Tesla P4支持得比较好。


    工程源码:http://download.csdn.net/download/qq_33892166/9841800

  • 相关阅读:
    Windows 科研软件推荐
    有关Python 包 (package) 的基本知识
    《Using Python to Access Web Data》Week4 Programs that Surf the Web 课堂笔记
    Coursera助学金申请模板
    《Using Databases with Python》 Week2 Basic Structured Query Language 课堂笔记
    Jupyter 解决单个变量输出问题
    解决 pandas 中打印 DataFrame 行列显示不全的问题
    《Using Python to Access Web Data》 Week3 Networks and Sockets 课堂笔记
    缓存击穿及解决方案
    jvm垃圾收集器
  • 原文地址:https://www.cnblogs.com/betterwgo/p/6858806.html
Copyright © 2011-2022 走看看