使用 Runtime API 和 Driver API 检测设备相关属性,并检测设备之间的拓扑以及主机与设备之间的拓扑(包括是否支持设备间 / 主机与设备间的原生原子操作)。
▶ 源代码:Runtime API
#include <memory>
#include <iostream>
#include <string>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <helper_cuda.h>

#if CUDART_VERSION < 5000
#include <cuda.h>

// CUDA 4.0 - 4.2: memory clock, bus width and L2 size are missing from
// cudaDeviceProp, so fetch them through the Driver API instead.
// Exits the process on any Driver API failure.
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr,
                "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}
#endif

// Enumerate every CUDA device visible to the Runtime API, print its
// properties, probe peer-to-peer access between capable device pairs,
// and finish with a one-line environment summary.
int main()
{
    printf("Start.\n");
    printf("CUDA Device Query (Runtime API) version (CUDART static linking)\n");

    int deviceCount;
    cudaError_t error_id;
    if ((error_id = cudaGetDeviceCount(&deviceCount)) != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d -> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = Fail\n");
        exit(EXIT_FAILURE);
    }
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    // Initialize the versions so the summary below is well-defined even
    // when deviceCount == 0 (the loop would otherwise leave them unset).
    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);

        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        // Versions are encoded as 1000*major + 10*minor, e.g. 9000 -> 9.0.
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
               driverVersion / 1000, (driverVersion % 100) / 10,
               runtimeVersion / 1000, (runtimeVersion % 100) / 10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
               deviceProp.major, deviceProp.minor);
        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
               (float)deviceProp.totalGlobalMem / 1048576.0f,
               (unsigned long long)deviceProp.totalGlobalMem);
        printf("  Multiprocessors: %2d, CUDA Cores/MP: %3d, %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        // clockRate is in kHz, hence *1e-3 for MHz and *1e-6 for GHz.
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n",
               deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
        printf("  Memory Clock rate:                             %.0f Mhz\n",
               deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n",
               deviceProp.memoryBusWidth);
        if (deviceProp.l2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n",
                   deviceProp.l2CacheSize);
#else
        // CUDA 4.0 - 4.2: these attributes are only reachable via the Driver API.
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
#endif
        printf("  Maximum Texture Dimension Size (x,y,z)         1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
               deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);
        printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Maximum number of threads per multiprocessor:  %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Max dimension size of a thread block (x,y,z):  (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Max dimension size of a grid size    (x,y,z):  (%d, %d, %d)\n",
               deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %lu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %lu bytes\n", deviceProp.textureAlignment);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
        printf("  Run time limit on kernels:                     %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf("  Integrated GPU sharing Host Memory:            %s\n",
               deviceProp.integrated ? "Yes" : "No");
        printf("  Support host page-locked memory mapping:       %s\n",
               deviceProp.canMapHostMemory ? "Yes" : "No");
        printf("  Alignment requirement for Surfaces:            %s\n",
               deviceProp.surfaceAlignment ? "Yes" : "No");
        printf("  Device has ECC support:                        %s\n",
               deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                                    : "WDDM (Windows Display Driver Model)");
#endif
        printf("  Device supports Unified Addressing (UVA):      %s\n",
               deviceProp.unifiedAddressing ? "Yes" : "No");
        printf("  Supports Cooperative Kernel Launch:            %s\n",
               deviceProp.cooperativeLaunch ? "Yes" : "No");
        printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
               deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        // Clamp to "Unknown" so an unexpected mode value cannot index out of bounds.
        int modeIdx = (deviceProp.computeMode >= 0 && deviceProp.computeMode <= 3)
                          ? deviceProp.computeMode : 4;
        printf("  Compute Mode: < %s >\n", sComputeMode[modeIdx]);
    }

    if (deviceCount >= 2) // Multi-GPU: report P2P access between capable pairs.
    {
        cudaDeviceProp prop[64];
        int gpuid[64], count = 0, can_access_peer;
        // Never index past the fixed-size prop[]/gpuid[] arrays.
        int nCheck = (deviceCount < 64) ? deviceCount : 64;

        for (int i = 0; i < nCheck; i++) // collect the ids of P2P-capable devices
        {
            cudaGetDeviceProperties(&prop[i], i);
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
            // On Windows, P2P additionally requires the TCC driver.
            if ((prop[i].major >= 2) && prop[i].tccDriver)
#else
            if (prop[i].major >= 2)
#endif
                gpuid[count++] = i;
        }
        if (count >= 2)
        {
            for (int i = 0; i < count - 1; i++)
            {
                for (int j = i + 1; j < count; j++)
                {
                    cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
                    printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",
                           prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    // Environment summary line.
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];

    sProfileString += ", NumDevs = "; // device count
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, sizeof(cTemp), "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;

    sProfileString += ", CUDA Driver Version = "; // driver version
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    sprintf(cTemp, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;

    sProfileString += ", CUDA Runtime Version = "; // runtime version
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;
    printf("%s\n", sProfileString.c_str());

    printf("\nFinish: Result = Pass\n");
    getchar(); // keep the console window open
    return 0;
}
▶ 输出结果:
Start. CUDA Device Query (Runtime API) version (CUDART static linking) Detected 1 CUDA Capable device(s) Device 0: "GeForce GTX 1070" CUDA Driver Version / Runtime Version 9.0 / 9.0 CUDA Capability Major/Minor version number: 6.1 Total amount of global memory: 8192 MBytes (8589934592 bytes) Multiprocessors: 16, CUDA Cores/MP: 128 2048 CUDA Cores GPU Max Clock rate: 1645 MHz (1.64 GHz) Memory Clock rate: 4004 Mhz Memory Bus Width: 256-bit L2 Cache Size: 2097152 bytes Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384) Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers Total amount of constant memory: 65536 bytes Total amount of shared memory per block: 49152 bytes Total number of registers available per block: 65536 Warp size: 32 Maximum number of threads per multiprocessor: 2048 Maximum number of threads per block: 1024 Max dimension size of a thread block (x,y,z): (1024, 1024, 64) Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) Maximum memory pitch: 2147483647 bytes Texture alignment: 512 bytes Concurrent copy and kernel execution: Yes with 2 copy engine(s) Run time limit on kernels: Yes Integrated GPU sharing Host Memory: No Support host page-locked memory mapping: Yes Alignment requirement for Surfaces: Yes Device has ECC support: Disabled CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model) Device supports Unified Addressing (UVA): Yes Supports Cooperative Kernel Launch: No Supports MultiDevice Co-op Kernel Launch: No Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0 Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > deviceQuery, CUDA Driver = CUDART, NumDevs = 1, CUDA Driver Version = 9.0, CUDA Runtime Version = 9.0 Finish: Result = Pass
▶ 源代码:Driver API
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <helper_cuda_drvapi.h>

// Enumerate every CUDA device through the Driver API (cuInit / cuDeviceGet*),
// print its attributes via cuDeviceGetAttribute (wrapped by getCudaAttribute
// from helper_cuda_drvapi.h), then probe peer-to-peer access between pairs.
int main(int argc, char **argv)
{
    printf("Start.\n");
    printf("CUDA Device Query (Driver API) version (CUDART static linking)\n");

    CUresult error_id;
    // The Driver API requires explicit initialization before any other call.
    if ((error_id = cuInit(0)) != CUDA_SUCCESS)
    {
        printf("cuInit(0) returned %d -> %s\n", error_id, getCudaDrvErrorString(error_id));
        printf("Result = Fail\n");
        exit(EXIT_FAILURE);
    }
    int deviceCount = 0;
    if ((error_id = cuDeviceGetCount(&deviceCount)) != CUDA_SUCCESS)
    {
        printf("cuDeviceGetCount returned %d -> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    for (CUdevice dev = 0; dev < deviceCount; ++dev)
    {
        char deviceName[256];
        if ((error_id = cuDeviceGetName(deviceName, 256, dev)) != CUDA_SUCCESS)
        {
            printf("cuDeviceGetName returned %d -> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
            printf("Result = FAIL\n");
            exit(EXIT_FAILURE);
        }
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);

        int driverVersion;
        cuDriverGetVersion(&driverVersion);
        // Version encoded as 1000*major + 10*minor.
        printf("  CUDA Driver Version:                           %d.%d\n",
               driverVersion / 1000, (driverVersion % 100) / 10);

        int major, minor;
        if ((error_id = cuDeviceComputeCapability(&major, &minor, dev)) != CUDA_SUCCESS)
        {
            printf("cuDeviceComputeCapability returned %d -> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
            printf("Result = FAIL\n");
            exit(EXIT_FAILURE);
        }
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);

        size_t totalGlobalMem;
        if ((error_id = cuDeviceTotalMem(&totalGlobalMem, dev)) != CUDA_SUCCESS)
        {
            printf("cuDeviceTotalMem returned %d -> %s\n", (int)error_id, getCudaDrvErrorString(error_id));
            printf("Result = FAIL\n");
            exit(EXIT_FAILURE);
        }
        printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
               (float)totalGlobalMem / 1048576.0f, (unsigned long long)totalGlobalMem);

        int multiProcessorCount;
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
               multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
               _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);

        // clockRate is reported in kHz.
        int clockRate;
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n",
               clockRate * 1e-3f, clockRate * 1e-6f);
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize)
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);

        int maxTex1D, maxTex2D[2], maxTex3D[3];
        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
        printf("  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) 3D=(%d, %d, %d)\n",
               maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1], maxTex3D[2]);

        int maxTex1DLayered[2];
        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
        printf("  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
               maxTex1DLayered[0], maxTex1DLayered[1]);

        int maxTex2DLayered[3];
        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
        printf("  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d layers\n",
               maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);

        int totalConstantMemory;
        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
        int sharedMemPerBlock;
        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
        int regsPerBlock;
        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
        int warpSize;
        getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
        printf("  Warp size:                                     %d\n", warpSize);
        int maxThreadsPerMultiProcessor;
        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
        printf("  Maximum number of threads per multiprocessor:  %d\n", maxThreadsPerMultiProcessor);
        int maxThreadsPerBlock;
        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        printf("  Maximum number of threads per block:           %d\n", maxThreadsPerBlock);

        int blockDim[3];
        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        printf("  Max dimension size of a thread block (x,y,z):  (%d, %d, %d)\n",
               blockDim[0], blockDim[1], blockDim[2]);
        int gridDim[3];
        getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        printf("  Max dimension size of a grid size    (x,y,z):  (%d, %d, %d)\n",
               gridDim[0], gridDim[1], gridDim[2]);

        int textureAlign;
        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
        printf("  Texture alignment:                             %u bytes\n", textureAlign);
        int memPitch;
        getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);

        int gpuOverlap;
        getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
        int asyncEngineCount;
        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
        printf("  Concurrent copy and kernel execution:          %s with %d copy engine(s)\n",
               (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
        printf("  Run time limit on kernels:                     %s\n",
               kernelExecTimeoutEnabled ? "Yes" : "No");
        int integrated;
        getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
        printf("  Integrated GPU sharing Host Memory:            %s\n", integrated ? "Yes" : "No");
        int canMapHostMemory;
        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");
        int concurrentKernels;
        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");
        int surfaceAlignment;
        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
        printf("  Alignment requirement for Surfaces:            %s\n", surfaceAlignment ? "Yes" : "No");
        int eccEnabled;
        getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
        printf("  Device has ECC support:                        %s\n", eccEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        int tccDriver;
        getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
        printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
               tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                         : "WDDM (Windows Display Driver Model)");
#endif
        int unifiedAddressing;
        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
        printf("  Device supports Unified Addressing (UVA):      %s\n", unifiedAddressing ? "Yes" : "No");
        int cooperativeLaunch;
        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
        printf("  Supports Cooperative Kernel Launch:            %s\n", cooperativeLaunch ? "Yes" : "No");
        int cooperativeMultiDevLaunch;
        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
        printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
               cooperativeMultiDevLaunch ? "Yes" : "No");

        int pciDomainID, pciBusID, pciDeviceID;
        getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
        getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
        getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
        printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
               pciDomainID, pciBusID, pciDeviceID);

        const char *sComputeMode[] =
        {
            "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
            "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
            "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
            "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
            "Unknown",
            NULL
        };
        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
        // Clamp to "Unknown" so an unexpected mode value cannot index out of bounds.
        int modeIdx = (computeMode >= 0 && computeMode <= 3) ? computeMode : 4;
        printf("  Compute Mode: < %s >\n", sComputeMode[modeIdx]);
    }

    if (deviceCount >= 2) // Multi-GPU: report P2P access between capable pairs.
    {
        int gpuid[64], count = 0, major, minor, tccDriver, can_access_peer;
        // Never index past the fixed-size gpuid[] array.
        int nCheck = (deviceCount < 64) ? deviceCount : 64;
        for (int i = 0; i < nCheck; i++)
        {
            cuDeviceComputeCapability(&major, &minor, i);
            getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
            // On Windows, P2P additionally requires the TCC driver.
            if ((major >= 2) && tccDriver)
#else
            if (major >= 2)
#endif
                gpuid[count++] = i;
        }
        if (count >= 2)
        {
            char deviceName0[256], deviceName1[256];
            for (int i = 0; i < count - 1; i++)
            {
                for (int j = i + 1; j < count; j++)
                {
                    cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
                    cuDeviceGetName(deviceName0, 256, gpuid[i]);
                    cuDeviceGetName(deviceName1, 256, gpuid[j]);
                    printf("> Peer access between %s (GPU%d) -> %s (GPU%d) : %s\n",
                           deviceName0, gpuid[i], deviceName1, gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }

    printf("\nFinish: Result = Pass\n");
    getchar(); // keep the console window open
    return 0;
}
▶ 输出结果:
Start. CUDA Device Query (Driver API) version (CUDART static linking) Detected 1 CUDA Capable device(s) Device 0: "GeForce GTX 1070" CUDA Driver Version: 9.0 CUDA Capability Major/Minor version number: 6.1 Total amount of global memory: 8192 MBytes (8589934592 bytes) (16) Multiprocessors, (128) CUDA Cores/MP: 2048 CUDA Cores GPU Max Clock rate: 1645 MHz (1.64 GHz) Memory Clock rate: 4004 Mhz Memory Bus Width: 256-bit L2 Cache Size: 2097152 bytes Max Texture Dimension Sizes 1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384) Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers Total amount of constant memory: 65536 bytes Total amount of shared memory per block: 49152 bytes Total number of registers available per block: 65536 Warp size: 32 Maximum number of threads per multiprocessor: 2048 Maximum number of threads per block: 1024 Max dimension size of a thread block (x,y,z): (1024, 1024, 64) Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) Texture alignment: 512 bytes Maximum memory pitch: 2147483647 bytes Concurrent copy and kernel execution: Yes with 2 copy engine(s) Run time limit on kernels: Yes Integrated GPU sharing Host Memory: No Support host page-locked memory mapping: Yes Concurrent kernel execution: Yes Alignment requirement for Surfaces: Yes Device has ECC support: Disabled CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model) Device supports Unified Addressing (UVA): Yes Supports Cooperative Kernel Launch: No Supports MultiDevice Co-op Kernel Launch: No Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0 Compute Mode: < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > Finish: Result = Pass
▶ 源代码:topologyQuery
#include <iostream>
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>

// Report the topology between every pair of devices (P2P access, native
// atomic support, performance rank) and between each device and the host
// (native atomic support over the link).
int main()
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    // Device <-> device topology: every unordered pair (device1, device2).
    for (int device1 = 0; device1 < deviceCount - 1; device1++)
    {
        for (int device2 = device1 + 1; device2 < deviceCount; device2++)
        {
            // Zero-initialized so a failed query reads as "not supported".
            int perfRank = 0;
            int atomicSupported = 0;
            int accessSupported = 0;
            cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2);
            cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2);
            cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2);
            if (accessSupported)
            {
                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
                std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                std::cout << "  * Perf Rank: " << perfRank << std::endl;
            }
        }
    }

    // Device <-> host topology.
    for (int device = 0; device < deviceCount; device++)
    {
        // Zero-initialized so a failed query reads as "not supported".
        int atomicSupported = 0;
        cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device);
        std::cout << "GPU" << device << " <-> CPU:" << std::endl;
        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    }
    getchar(); // keep the console window open
    return 0;
}
▶ 输出结果:
GPU0 <-> CPU:
* Atomic Supported: no
▶ 涨姿势:
● Runtime API 比 Driver API 写起来更简单,且能直接检测的内容不少于 Driver API。
● 用到的 Runtime API 函数和 Driver API 函数。
1 // cuda_runtime_api.h 2 extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion); 3 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); 4 extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice); 5 6 // cuda_device_runtime_api.h 7 #define __NV_WEAK__ __declspec(nv_weak) 8 __device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) 9 10 // cuda.h 11 CUresult CUDAAPI cuDeviceGetCount(int *count); 12 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); 13 CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); 14 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);