  • 0_Simple__simpleP2P

    Use the P2P (peer-to-peer) feature to transfer, read, and write data between GPUs.

    ▶ Source code. It covers the checks required before P2P can be used, device-to-device copies, and data transfer and mutual access between host and devices.

    #include <stdlib.h>
    #include <stdio.h>
    #include <cuda_runtime.h>
    #include "device_launch_parameters.h"
    #include <helper_cuda.h>
    #include <helper_functions.h>

    #define MAX_GPU_COUNT 64

    // Double each element of src and store the result in dst
    __global__ void SimpleKernel(float *src, float *dst)
    {
        const int idx = blockIdx.x * blockDim.x + threadIdx.x;
        dst[idx] = src[idx] * 2.0f;
    }

    inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
    {
    #ifdef _WIN32
        return (bool)(pProp->tccDriver ? true : false);
    #else
        return (bool)(pProp->major >= 2);
    #endif
    }

    int main(int argc, char **argv)
    {
        printf("\n\tStarting %s\n", argv[0]);

        // Check that we are running in a 64-bit OS environment
        if (sizeof(void*) != 8)
        {
            printf("\n\tError: program is only supported with a 64-bit OS and a 64-bit target\n");
            return EXIT_WAIVED;
        }

        // Find the first two devices with compute capability >= 2.0
        int gpu_n;
        cudaGetDeviceCount(&gpu_n);
        printf("\n\tDevice count: %d\n", gpu_n);
        if (gpu_n < 2)
        {
            printf("\n\tError: two or more GPUs with SM 2.0 are required\n");
            return EXIT_WAIVED;
        }

        cudaDeviceProp prop[MAX_GPU_COUNT];
        int gpuid[MAX_GPU_COUNT], gpu_count = 0;
        printf("\n\tShow device\n");    // List all devices
        for (int i = 0; i < gpu_n; i++)
        {
            cudaGetDeviceProperties(&prop[i], i);
            if ((prop[i].major >= 2)
    #ifdef _WIN32
                && prop[i].tccDriver    // Windows also requires the Tesla Compute Cluster (TCC) driver
    #endif
               )
                gpuid[gpu_count++] = i;
            printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
        }
        if (gpu_count < 2)
        {
            printf("\n\tError: two or more GPUs with SM 2.0 are required\n");
    #ifdef _WIN32
            printf("\nOr the TCC driver is required\n");
    #endif
            cudaSetDevice(0);
            return EXIT_WAIVED;
        }

        // Find a device pair for the test
        int can_access_peer, p2pCapableGPUs[2];
        p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;
        printf("\n\tShow combination of devices with P2P\n");    // List every P2P-capable device pair
        for (int i = 0; i < gpu_count - 1; i++)
        {
            for (int j = i + 1; j < gpu_count; j++)
            {
                cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
                if (can_access_peer)
                {
                    printf("\n\tGPU%d (%s) <--> GPU%d (%s)\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name);
                    if (p2pCapableGPUs[0] == -1)
                        p2pCapableGPUs[0] = gpuid[i], p2pCapableGPUs[1] = gpuid[j];
                }
            }
        }
        if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
        {
            printf("\n\tError: P2P is not available among these GPUs\n");
            for (int i = 0; i < gpu_count; i++)
                cudaSetDevice(gpuid[i]);
            return EXIT_WAIVED;
        }

        // Run the test on the pair that was found
        gpuid[0] = p2pCapableGPUs[0];
        gpuid[1] = p2pCapableGPUs[1];
        printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);

        // Enable P2P
        cudaSetDevice(gpuid[0]);
        cudaDeviceEnablePeerAccess(gpuid[1], 0);
        cudaSetDevice(gpuid[1]);
        cudaDeviceEnablePeerAccess(gpuid[0], 0);

        // Check that both devices support Unified Virtual Addressing (UVA)
        if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
        {
            printf("\n\tError: GPU does not support UVA\n");
            return EXIT_WAIVED;
        }

        // Allocate memory
        const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
        printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
        cudaSetDevice(gpuid[0]);
        float *g0;
        cudaMalloc(&g0, buf_size);
        cudaSetDevice(gpuid[1]);
        float *g1;
        cudaMalloc(&g1, buf_size);
        float *h0;
        cudaMallocHost(&h0, buf_size);

        cudaEvent_t start_event, stop_event;
        int eventflags = cudaEventBlockingSync;
        float time_memcpy;
        cudaEventCreateWithFlags(&start_event, eventflags);
        cudaEventCreateWithFlags(&stop_event, eventflags);
        cudaEventRecord(start_event, 0);

        for (int i = 0; i < 100; i++)
        {
            // Copy back and forth between the two GPUs.
            // With UVA, cudaMemcpyDefault infers the copy direction from the
            // pointers themselves (host or device).
            if (i % 2 == 0)
                cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
            else
                cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault);
        }
        cudaEventRecord(stop_event, 0);
        cudaEventSynchronize(stop_event);
        cudaEventElapsedTime(&time_memcpy, start_event, stop_event);
        printf("\n\tcudaMemcpy: %.2fGB/s\n", (100.0f * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));

        for (int i = 0; i < buf_size / sizeof(float); i++)
            h0[i] = float(i % 4096);
        cudaSetDevice(gpuid[0]);
        cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

        const dim3 threads(512, 1);
        const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

        // Run the kernel on GPU1: read from GPU0's global memory, compute,
        // and write to GPU1's global memory
        printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
        cudaSetDevice(gpuid[1]);
        SimpleKernel<<<blocks, threads>>>(g0, g1);
        cudaDeviceSynchronize();

        // Run the kernel on GPU0: read from GPU1's global memory, compute,
        // and write to GPU0's global memory
        printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
        cudaSetDevice(gpuid[0]);
        SimpleKernel<<<blocks, threads>>>(g1, g0);
        cudaDeviceSynchronize();

        // Check the results
        cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault);
        int error_count = 0;
        for (int i = 0; i < buf_size / sizeof(float); i++)
        {
            if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
            {
                printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));
                if (error_count++ > 10)
                    break;
            }
        }

        // Disable P2P
        cudaSetDevice(gpuid[0]);
        cudaDeviceDisablePeerAccess(gpuid[1]);
        cudaSetDevice(gpuid[1]);
        cudaDeviceDisablePeerAccess(gpuid[0]);

        // Clean up
        cudaFreeHost(h0);
        cudaSetDevice(gpuid[0]);
        cudaFree(g0);
        cudaSetDevice(gpuid[1]);
        cudaFree(g1);
        cudaEventDestroy(start_event);
        cudaEventDestroy(stop_event);
        for (int i = 0; i < gpu_n; i++)
            cudaSetDevice(i);
        printf("\n\t%s!\n", error_count ? "Test failed" : "Test passed");

        getchar();
        return 0;
    }

    ▶ Output

    Only one device is available here, so the test could not be run yet.

    ▶ Notes:

    ● P2P requirements: at least two devices with compute capability 2.0 or higher that support the same unified virtual address space (UVA); a CUDA environment of at least version 4.0; and, on Windows, the Tesla Compute Cluster (TCC) driver. A runtime check for these requirements is sketched below.
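
    The sketch below is not part of the original sample; it only illustrates how those requirements could be queried at runtime with standard CUDA runtime calls (the device indices dev0/dev1 and the function name are assumptions):

    // Hedged sketch: runtime check of the P2P requirements listed above.
    // Error handling is omitted for brevity.
    bool CheckP2PRequirements(int dev0, int dev1)
    {
        cudaDeviceProp p0, p1;
        cudaGetDeviceProperties(&p0, dev0);
        cudaGetDeviceProperties(&p1, dev1);
        if (p0.major < 2 || p1.major < 2)                   // compute capability >= 2.0
            return false;
        if (!p0.unifiedAddressing || !p1.unifiedAddressing) // UVA on both devices
            return false;
    #ifdef _WIN32
        if (!p0.tccDriver || !p1.tccDriver)                 // TCC driver on Windows
            return false;
    #endif
        int a01 = 0, a10 = 0;                               // peer access in both directions
        cudaDeviceCanAccessPeer(&a01, dev0, dev1);
        cudaDeviceCanAccessPeer(&a10, dev1, dev0);
        return a01 && a10;
    }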

    ● Key steps for using P2P

    // Check whether two devices can use P2P with each other
    int can_access_peer;
    cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

    // Enable P2P
    cudaSetDevice(gpuid[i]);
    cudaDeviceEnablePeerAccess(gpuid[j], 0);
    cudaSetDevice(gpuid[j]);
    cudaDeviceEnablePeerAccess(gpuid[i], 0);

    // Transfer data between the devices
    cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

    // Disable P2P
    cudaSetDevice(gpuid[i]);
    cudaDeviceDisablePeerAccess(gpuid[j]);
    cudaSetDevice(gpuid[j]);
    cudaDeviceDisablePeerAccess(gpuid[i]);

    // cuda_runtime_api.h
    extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);

    extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);

    extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
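
    When UVA is not available, cudaMemcpyDefault cannot infer the copy direction, but the runtime also exposes explicit peer-copy calls that name both devices. A minimal sketch (dev0/dev1, d_src/d_dst, and nbytes are illustrative names, not from the sample):

    // Hedged sketch: explicit peer-to-peer copies, usable with or without UVA
    cudaMemcpyPeer(d_dst, dev1, d_src, dev0, nbytes);               // synchronous copy

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpyPeerAsync(d_dst, dev1, d_src, dev0, nbytes, stream);  // asynchronous copy
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);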

    ● Definitions from the helper code

    // helper_string.h
    #define EXIT_WAIVED 2
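
    The listing above calls the runtime API without checking return values. helper_cuda.h (already included) provides the checkCudaErrors macro used throughout the CUDA samples; wrapping each call with it is a common hardening step. A sketch, not part of the original listing:

    // Hedged sketch: checkCudaErrors reports the failing call with file/line
    // and exits if a runtime call does not return cudaSuccess.
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
    checkCudaErrors(cudaMalloc(&g0, buf_size));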