zoukankan      html  css  js  c++  java
  • 1_Utilities__deviceQuery + 1_Utilities__deviceQueryDrv + 1_Utilities__topologyQuery

    使用 Runtime API 和 Driver API 检测设备相关属性。并检测了设备之间的拓扑以及主机与设备之间的拓扑(是否支持跨设备原子操作)。

    ▶ 源代码:Runtime API

      1 #include <memory>
      2 #include <iostream>
      3 #include <cuda_runtime.h>
      4 #include <helper_cuda.h>
      5 
      6 #if CUDART_VERSION < 5000
      7 #include <cuda.h>
      8 
      9 template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)// 将 Driver API 的获取属性函数放到模板中
     10 {
     11     CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
     12     if (CUDA_SUCCESS != error)
     13     {
     14         fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.
    ", error, __FILE__, __LINE__);
     15         exit(EXIT_FAILURE);
     16     }
     17 }
     18 #endif
     19 
     20 int main()
     21 {
     22     printf("Start.
    ");
     23     printf("
    	CUDA Device Query (Runtime API) version (CUDART static linking)
    ");
     24 
     25     int deviceCount;
     26     cudaError_t error_id;
     27     if ((error_id = cudaGetDeviceCount(&deviceCount)) != cudaSuccess)
     28     {
     29         printf("
    cudaGetDeviceCount returned %d
    -> %s
    ", (int)error_id, cudaGetErrorString(error_id));
     30         printf("
    Result = Fail
    ");
     31         exit(EXIT_FAILURE);
     32     }
     33     printf("
    Detected %d CUDA Capable device(s)
    ", deviceCount);
     34 
     35     int dev, driverVersion, runtimeVersion;
     36     for (dev = 0; dev < deviceCount; ++dev)
     37     {
     38         cudaSetDevice(dev);
     39         cudaDeviceProp deviceProp;
     40         cudaGetDeviceProperties(&deviceProp, dev);
     41         printf("
    Device %d: "%s"
    ", dev, deviceProp.name);
     42 
     43         cudaDriverGetVersion(&driverVersion);
     44         cudaRuntimeGetVersion(&runtimeVersion);
     45         printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d
    ",
     46             driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
     47         printf("  CUDA Capability Major/Minor version number:    %d.%d
    ", deviceProp.major, deviceProp.minor);
     48         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)
    ",
     49             (float)deviceProp.totalGlobalMem / 1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
     50         printf("  Multiprocessors: %2d, CUDA Cores/MP: %3d        %d CUDA Cores
    ",
     51             deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
     52             _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
     53         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)
    ", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
     54 #if CUDART_VERSION >= 5000
     55         printf("  Memory Clock rate:                             %.0f Mhz
    ", deviceProp.memoryClockRate * 1e-3f);
     56         printf("  Memory Bus Width:                              %d-bit
    ", deviceProp.memoryBusWidth);
     57         if (deviceProp.l2CacheSize)
     58             printf("  L2 Cache Size:                                 %d bytes
    ", deviceProp.l2CacheSize);
     59 #else// 在CUDA 4.0 - 4.2 中,需要通过 Driver API 来访问相关属性
     60         int memoryClock;
     61         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
     62         printf("  Memory Clock rate:                             %.0f Mhz
    ", memoryClock * 1e-3f);
     63         int memBusWidth;
     64         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
     65         printf("  Memory Bus Width:                              %d-bit
    ", memBusWidth);
     66         int L2CacheSize;
     67         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
     68         if (L2CacheSize)
     69             printf("  L2 Cache Size:                                 %d bytes
    ", L2CacheSize);
     70 #endif
     71         printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)
    ",
     72             deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
     73             deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
     74         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers
    ",
     75             deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
     76         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers
    ",
     77             deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
     78         printf("  Total amount of constant memory:               %lu bytes
    ", deviceProp.totalConstMem);
     79         printf("  Total amount of shared memory per block:       %lu bytes
    ", deviceProp.sharedMemPerBlock);
     80         printf("  Total number of registers available per block: %d
    ", deviceProp.regsPerBlock);
     81         printf("  Warp size:                                     %d
    ", deviceProp.warpSize);
     82         printf("  Maximum number of threads per multiprocessor:  %d
    ", deviceProp.maxThreadsPerMultiProcessor);
     83         printf("  Maximum number of threads per block:           %d
    ", deviceProp.maxThreadsPerBlock);
     84         printf("  Max dimension size of a thread block (x,y,z):  (%d, %d, %d)
    ",
     85             deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
     86         printf("  Max dimension size of a grid size    (x,y,z):  (%d, %d, %d)
    ",
     87             deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
     88         printf("  Maximum memory pitch:                          %lu bytes
    ", deviceProp.memPitch);
     89         printf("  Texture alignment:                             %lu bytes
    ", deviceProp.textureAlignment);
     90         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)
    ", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
     91         printf("  Run time limit on kernels:                     %s
    ", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
     92         printf("  Integrated GPU sharing Host Memory:            %s
    ", deviceProp.integrated ? "Yes" : "No");
     93         printf("  Support host page-locked memory mapping:       %s
    ", deviceProp.canMapHostMemory ? "Yes" : "No");
     94         printf("  Alignment requirement for Surfaces:            %s
    ", deviceProp.surfaceAlignment ? "Yes" : "No");
     95         printf("  Device has ECC support:                        %s
    ", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
     96 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
     97         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s
    ", deviceProp.tccDriver ?
     98             "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
     99 #endif
    100         printf("  Device supports Unified Addressing (UVA):      %s
    ", deviceProp.unifiedAddressing ? "Yes" : "No");
    101         printf("  Supports Cooperative Kernel Launch:            %s
    ", deviceProp.cooperativeLaunch ? "Yes" : "No");
    102         printf("  Supports MultiDevice Co-op Kernel Launch:      %s
    ", deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
    103         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d
    ", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
    104 
    105         const char *sComputeMode[] =
    106         {
    107             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
    108             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
    109             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
    110             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
    111             "Unknown",
    112             NULL
    113         };
    114         printf("  Compute Mode: < %s >
    ", sComputeMode[deviceProp.computeMode]);
    115     }
    116 
    117     if (deviceCount >= 2)// 多设备情形,找出最靠前的两张支持 P2P 的设备
    118     {
    119         cudaDeviceProp prop[64];
    120         int gpuid[64], count = 0, can_access_peer;
    121 
    122         for (int i = 0; i < deviceCount; i++)// 在 gpuid 中记录支持 P2P 的设备编号
    123         {
    124             cudaGetDeviceProperties(&prop[i], i);
    125 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)// Windows 系统需要安装 Tesla 计算集群驱动
    126             if ((prop[i].major >= 2) && prop[i].tccDriver)
    127 #else
    128             if ((prop[i].major >= 2))
    129 #endif
    130                 gpuid[count++] = i;
    131         }
    132         if (count >= 2)
    133         {
    134             for (int i = 0; i < count - 1; i++)
    135             {
    136                 for (int j = i + 1; j < count; j++)
    137                 {
    138                     cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
    139                     printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s
    ",
    140                         prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j], can_access_peer ? "Yes" : "No");
    141                 }
    142             }
    143         }
    144     }
    145 
    146     // 设备环境总况
    147     printf("
    ");
    148     std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    149     char cTemp[16];
    150 
    151     sProfileString += ", NumDevs = ";// 设备数
    152 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    153     sprintf_s(cTemp, 10, "%d", deviceCount);
    154 #else
    155     sprintf(cTemp, "%d", deviceCount);
    156 #endif
    157     sProfileString += cTemp;
    158 
    159     sProfileString += ", CUDA Driver Version = ";// Driver 版本
    160 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    161     sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
    162 #else
    163     sprintf(cTemp, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
    164 #endif
    165     sProfileString += cTemp;
    166 
    167     sProfileString += ", CUDA Runtime Version = ";// Runtime 版本
    168 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    169     sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    170 #else
    171     sprintf(cTemp, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    172 #endif
    173     sProfileString += cTemp;
    174     printf("
    %s
    ", sProfileString.c_str());
    175 
    176     printf("
    Finish: Result = Pass
    ");
    177     getchar();
    178     return 0;
    179 }

    ▶ 输出结果:

    Start.
    
        CUDA Device Query (Runtime API) version (CUDART static linking)
    
    Detected 1 CUDA Capable device(s)
    
    Device 0: "GeForce GTX 1070"
      CUDA Driver Version / Runtime Version          9.0 / 9.0
      CUDA Capability Major/Minor version number:    6.1
      Total amount of global memory:                 8192 MBytes (8589934592 bytes)
      Multiprocessors: 16, CUDA Cores/MP: 128        2048 CUDA Cores
      GPU Max Clock rate:                            1645 MHz (1.64 GHz)
      Memory Clock rate:                             4004 Mhz
      Memory Bus Width:                              256-bit
      L2 Cache Size:                                 2097152 bytes
      Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
      Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
      Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
      Total amount of constant memory:               65536 bytes
      Total amount of shared memory per block:       49152 bytes
      Total number of registers available per block: 65536
      Warp size:                                     32
      Maximum number of threads per multiprocessor:  2048
      Maximum number of threads per block:           1024
      Max dimension size of a thread block (x,y,z):  (1024, 1024, 64)
      Max dimension size of a grid size    (x,y,z):  (2147483647, 65535, 65535)
      Maximum memory pitch:                          2147483647 bytes
      Texture alignment:                             512 bytes
      Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
      Run time limit on kernels:                     Yes
      Integrated GPU sharing Host Memory:            No
      Support host page-locked memory mapping:       Yes
      Alignment requirement for Surfaces:            Yes
      Device has ECC support:                        Disabled
      CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
      Device supports Unified Addressing (UVA):      Yes
      Supports Cooperative Kernel Launch:            No
      Supports MultiDevice Co-op Kernel Launch:      No
      Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
      Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
    
    
    deviceQuery, CUDA Driver = CUDART, NumDevs = 1, CUDA Driver Version = 9.0, CUDA Runtime Version = 9.0
    
    Finish: Result = Pass

    ▶ 源代码:Driver API

      1 #include <stdio.h>
      2 #include <cuda.h>
      3 #include <helper_cuda_drvapi.h>
      4 
      5 int main(int argc, char **argv)
      6 {
      7     printf("Start.
    ");
      8     printf("CUDA Device Query (Driver API) version (CUDART static linking)
    ");
      9 
     10     CUresult error_id;
     11     if ((error_id = cuInit(0)) != CUDA_SUCCESS)
     12     {
     13         printf("
    cuInit(0) returned %d
    -> %s
    ", error_id, getCudaDrvErrorString(error_id));
     14         printf("
    Result = Fail
    ");
     15         exit(EXIT_FAILURE);
     16     }
     17     int deviceCount = 0;
     18     if ((error_id = cuDeviceGetCount(&deviceCount)) != CUDA_SUCCESS)
     19     {
     20         printf("
    cuDeviceGetCount returned %d
    -> %s
    ", (int)error_id, getCudaDrvErrorString(error_id));
     21         printf("
    Result = FAIL
    ");
     22         exit(EXIT_FAILURE);
     23     }
     24     printf("
    Detected %d CUDA Capable device(s)
    ", deviceCount);
     25     for (CUdevice dev = 0; dev < deviceCount; ++dev)
     26     {
     27         char deviceName[256];
     28         if ((error_id = cuDeviceGetName(deviceName, 256, dev)) != CUDA_SUCCESS)
     29         {
     30             printf("
    cuDeviceGetName returned %d
    -> %s
    ", (int)error_id, getCudaDrvErrorString(error_id));
     31             printf("
    Result = FAIL
    ");
     32             exit(EXIT_FAILURE);
     33         }
     34         printf("
    Device %d: "%s"
    ", dev, deviceName);
     35         int driverVersion;
     36         cuDriverGetVersion(&driverVersion);
     37         printf("  CUDA Driver Version:                           %d.%d
    ", driverVersion/1000, (driverVersion%100)/10);
     38         int major, minor;
     39         if ((error_id = cuDeviceComputeCapability(&major, &minor, dev)) != CUDA_SUCCESS)
     40         {
     41             printf("
    cuDeviceComputeCapability returned %d
    -> %s
    ", (int)error_id, getCudaDrvErrorString(error_id));
     42             printf("
    Result = FAIL
    ");
     43             exit(EXIT_FAILURE);
     44         }
     45         printf("  CUDA Capability Major/Minor version number:    %d.%d
    ", major, minor);
     46         size_t totalGlobalMem;
     47         if ((error_id = cuDeviceTotalMem(&totalGlobalMem, dev)) != CUDA_SUCCESS)
     48         {
     49             printf("cuDeviceTotalMem returned %d
    -> %s
    ", (int)error_id, getCudaDrvErrorString(error_id));
     50             printf("Result = FAIL
    ");
     51             exit(EXIT_FAILURE);
     52         }
     53         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)
    ",
     54             (float)totalGlobalMem / 1048576.0f, (unsigned long long) totalGlobalMem);
     55         int multiProcessorCount;
     56         getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
     57         printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores
    ", 
     58             multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor), _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
     59         int clockRate;
     60         getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
     61         printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)
    ", clockRate * 1e-3f, clockRate * 1e-6f);
     62         int memoryClock;
     63         getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
     64         printf("  Memory Clock rate:                             %.0f Mhz
    ", memoryClock * 1e-3f);
     65         int memBusWidth;
     66         getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
     67         printf("  Memory Bus Width:                              %d-bit
    ", memBusWidth);
     68         int L2CacheSize;
     69         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
     70         if (L2CacheSize)
     71             printf("  L2 Cache Size:                                 %d bytes
    ", L2CacheSize);
     72         int maxTex1D, maxTex2D[2], maxTex3D[3];
     73         getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
     74         getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
     75         getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
     76         getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
     77         getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
     78         getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
     79         printf("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) 3D=(%d, %d, %d)
    ", 
     80             maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]);
     81         int  maxTex1DLayered[2];
     82         getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
     83         getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
     84         printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers
    ", maxTex1DLayered[0], maxTex1DLayered[1]);
     85         int  maxTex2DLayered[3];
     86         getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
     87         getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
     88         getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
     89         printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers
    ",
     90             maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
     91         int totalConstantMemory;
     92         getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
     93         printf("  Total amount of constant memory:               %u bytes
    ", totalConstantMemory);
     94         int sharedMemPerBlock;
     95         getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
     96         printf("  Total amount of shared memory per block:       %u bytes
    ", sharedMemPerBlock);
     97         int regsPerBlock;
     98         getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
     99         printf("  Total number of registers available per block: %d
    ", regsPerBlock);
    100         int warpSize;
    101         getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
    102         printf("  Warp size:                                     %d
    ", warpSize);
    103         int maxThreadsPerMultiProcessor;
    104         getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
    105         printf("  Maximum number of threads per multiprocessor:  %d
    ", maxThreadsPerMultiProcessor);
    106         int maxThreadsPerBlock;
    107         getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
    108         printf("  Maximum number of threads per block:           %d
    ", maxThreadsPerBlock);
    109         int blockDim[3];
    110         getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
    111         getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
    112         getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
    113         printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)
    ", blockDim[0], blockDim[1], blockDim[2]);
    114         int gridDim[3];
    115         getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
    116         getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
    117         getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
    118         printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)
    ", gridDim[0], gridDim[1], gridDim[2]);
    119         int textureAlign;
    120         getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
    121         printf("  Texture alignment:                             %u bytes
    ", textureAlign);
    122         int memPitch;
    123         getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
    124         printf("  Maximum memory pitch:                          %u bytes
    ", memPitch);
    125         int gpuOverlap;
    126         getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
    127         int asyncEngineCount;
    128         getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
    129         printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)
    ", (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
    130         int kernelExecTimeoutEnabled;
    131         getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
    132         printf("  Run time limit on kernels:                     %s
    ", kernelExecTimeoutEnabled ? "Yes" : "No");
    133         int integrated;
    134         getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
    135         printf("  Integrated GPU sharing Host Memory:            %s
    ", integrated ? "Yes" : "No");
    136         int canMapHostMemory;
    137         getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
    138         printf("  Support host page-locked memory mapping:       %s
    ", canMapHostMemory ? "Yes" : "No");
    139         int concurrentKernels;
    140         getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
    141         printf("  Concurrent kernel execution:                   %s
    ", concurrentKernels ? "Yes" : "No");
    142         int surfaceAlignment;
    143         getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
    144         printf("  Alignment requirement for Surfaces:            %s
    ", surfaceAlignment ? "Yes" : "No");
    145         int eccEnabled;
    146         getCudaAttribute<int>(&eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
    147         printf("  Device has ECC support:                        %s
    ", eccEnabled ? "Enabled" : "Disabled");
    148 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    149         int tccDriver ;
    150         getCudaAttribute<int>(&tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
    151         printf("  CUDA Device Driver Mode (TCC or WDDM):         %s
    ", tccDriver ?
    152             "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
    153 #endif
    154         int unifiedAddressing;
    155         getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    156         printf("  Device supports Unified Addressing (UVA):      %s
    ", unifiedAddressing ? "Yes" : "No");
    157         int cooperativeLaunch;
    158         getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
    159         printf("  Supports Cooperative Kernel Launch:            %s
    ", cooperativeLaunch ? "Yes" : "No");
    160         int cooperativeMultiDevLaunch;
    161         getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
    162         printf("  Supports MultiDevice Co-op Kernel Launch:      %s
    ", cooperativeMultiDevLaunch ? "Yes" : "No");
    163         int pciDomainID, pciBusID, pciDeviceID;
    164         getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
    165         getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
    166         getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
    167         printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d
    ", pciDomainID, pciBusID, pciDeviceID);
    168         
    169         const char *sComputeMode[] =
    170         {
    171             "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
    172             "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
    173             "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
    174             "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
    175             "Unknown",
    176             NULL
    177         };
    178         int computeMode;
    179         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
    180         printf("  Compute Mode: < %s >
    ", sComputeMode[computeMode]);
    181     }
    182 
    183     if (deviceCount >= 2)// 多设备情形
    184     {
    185         int gpuid[64], count = 0, major, minor, tccDriver, can_access_peer;
    186         for (int i = 0; i < deviceCount; i++)
    187         {
    188             cuDeviceComputeCapability(&major, &minor, i);
    189             getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
    190 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    191             if ((major >= 2) && tccDriver)
    192 #else
    193             if ((major >= 2))
    194 #endif
    195                 gpuid[count++] = i;
    196         }
    197         if (count >= 2)
    198         {
    199             char deviceName0[256], deviceName1[256]; 
    200             for (int i = 0; i < count - 1; i++)
    201             {
    202                 for (int j = i + 1; j < count; j++)
    203                 {
    204                     cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
    205                     cuDeviceGetName(deviceName0, 256, gpuid[i]);
    206                     cuDeviceGetName(deviceName1, 256, gpuid[j]);
    207                     printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s
    ",
    208                         deviceName0, gpuid[i], deviceName1, gpuid[j], can_access_peer ? "Yes" : "No"); 
    209                 }
    210             }
    211         }
    212     }
    213 
    214     printf("
    Finish: Result = Pass
    ");
    215     getchar();
    216     return 0;
    217 }

    ▶ 输出结果:

    Start.
    CUDA Device Query (Driver API) version (CUDART static linking)
    
    Detected 1 CUDA Capable device(s)
    
    Device 0: "GeForce GTX 1070"
      CUDA Driver Version:                           9.0
      CUDA Capability Major/Minor version number:    6.1
      Total amount of global memory:                 8192 MBytes (8589934592 bytes)
      (16) Multiprocessors, (128) CUDA Cores/MP:     2048 CUDA Cores
      GPU Max Clock rate:                            1645 MHz (1.64 GHz)
      Memory Clock rate:                             4004 Mhz
      Memory Bus Width:                              256-bit
      L2 Cache Size:                                 2097152 bytes
      Max Texture Dimension Sizes                    1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
      Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
      Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
      Total amount of constant memory:               65536 bytes
      Total amount of shared memory per block:       49152 bytes
      Total number of registers available per block: 65536
      Warp size:                                     32
      Maximum number of threads per multiprocessor:  2048
      Maximum number of threads per block:           1024
      Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
      Max dimension size of a grid size (x,y,z):    (2147483647, 65535, 65535)
      Texture alignment:                             512 bytes
      Maximum memory pitch:                          2147483647 bytes
      Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
      Run time limit on kernels:                     Yes
      Integrated GPU sharing Host Memory:            No
      Support host page-locked memory mapping:       Yes
      Concurrent kernel execution:                   Yes
      Alignment requirement for Surfaces:            Yes
      Device has ECC support:                        Disabled
      CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
      Device supports Unified Addressing (UVA):      Yes
      Supports Cooperative Kernel Launch:            No
      Supports MultiDevice Co-op Kernel Launch:      No
      Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
      Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
    
    Finish: Result = Pass

    ▶ 源代码:topologyQuery

     1 #include <cuda_runtime.h>
     2 #include <helper_cuda.h>
     3 #include <helper_functions.h>s
     4 
     5 int main()
     6 {
     7     int deviceCount;
     8     cudaGetDeviceCount(&deviceCount);
     9     for (int device1 = 0; device1 < deviceCount - 1; device1++)// 设备间拓扑
    10     {
    11         for (int device2 = device1 + 1; device2 < deviceCount; device2++)
    12         {            
    13             int perfRank = 0;
    14             int atomicSupported = 0;
    15             int accessSupported = 0;
    16             cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2);
    17             cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
    18             cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2);
    19             if (accessSupported)
    20             {
    21                 std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
    22                 std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    23                 std::cout << "  * Perf Rank: " << perfRank << std::endl;
    24             }
    25         }
    26     }
    27     for (int device = 0; device < deviceCount; device++)// 设备与主机间间拓扑
    28     {
    29         int atomicSupported;
    30         cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device);
    31         std::cout << "GPU" << device << " <-> CPU:" << std::endl;
    32         std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    33     }
    34     getchar();
    35     return 0;
    36 }

    ▶ 输出结果:

    GPU0 <-> CPU:
      * Atomic Supported: no

    ▶ 涨姿势:

    ● Runtime API 比 Driver API 写起来更简单,且能直接检测的内容不少于 Driver API。

    ● 用到的 Runtime API 函数和 Driver API 函数。

     1 // cuda_runtime_api.h
     2 extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
     3 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
     4 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
     5 
     6 // cuda_device_runtime_api.h
     7 #define __NV_WEAK__ __declspec(nv_weak)
     8 __device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
     9 
    10 // cuda.h
    11 CUresult CUDAAPI cuDeviceGetCount(int *count);
    12 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
    13 CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
    14 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
  • 相关阅读:
    (笔记)Linux内核学习(二)之进程
    (笔记)Linux内核学习(一)之内核介绍
    状态机思路在程序设计中的应用
    内存操作函数memmove,memcpy,memset
    linux下常用的几个时间函数:time,gettimeofday,clock_gettime,_ftime
    Camera ISO、快门、光圈、曝光这几个概念
    C语言中的指针和内存泄漏几种情况
    音视频文件码率与大小计算
    CC++中 fopen中文件打开方式的区别:
    常用DOS命令
  • 原文地址:https://www.cnblogs.com/cuancuancuanhao/p/8040034.html
Copyright © 2011-2022 走看看