zoukankan      html  css  js  c++  java
  • 可移动固定内存测试

    可移动固定内存测试,项目打包下载

      1 #include "../common/book.h"
      2 #include "cuda_runtime.h"
      3 #include "device_launch_parameters.h"
      4 #include "device_functions.h"
      // Minimum of two values. Every argument use and the whole expansion are
      // parenthesized so that operands containing lower-precedence operators
      // (for example `x & y`) expand as intended. Each argument can still be
      // evaluated twice, so do not pass expressions with side effects.
      #define imin(a,b) (((a) < (b)) ? (a) : (b))

      // Total number of float elements in each input vector.
      #define     N    (33*1024*1024)
      // Threads per block; the kernel's reduction needs this to be a power of 2.
      const int threadsPerBlock = 256;
      // Blocks per grid: enough blocks to cover N / 2 elements, capped at 32.
      const int blocksPerGrid =
          imin(32, (N / 2 + threadsPerBlock - 1) / threadsPerBlock);
     11 
     12 
     /*
      * Computes per-block partial sums of the dot product of a and b.
      *
      *   size - number of elements of a and b to process
      *   a, b - input vectors (device-accessible pointers)
      *   c    - output; element blockIdx.x receives this block's partial sum
      *
      * Uses a statically sized shared array of threadsPerBlock floats, so the
      * launch block size must not exceed threadsPerBlock. The tree reduction
      * below requires blockDim.x to be a power of 2.
      */
     __global__ void dot(int size, float *a, float *b, float *c) {
         __shared__ float cache[threadsPerBlock];

         const int slot = threadIdx.x;

         // Each thread accumulates the products of the elements it owns,
         // advancing by the total number of launched threads each step.
         float sum = 0;
         for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
              idx < size;
              idx += blockDim.x * gridDim.x) {
             sum += a[idx] * b[idx];
         }

         // Publish this thread's running sum to the block-shared cache.
         cache[slot] = sum;

         // Synchronize the threads of the block before reading the cache.
         __syncthreads();

         // Pairwise reduction: halve the active range until cache[0] holds
         // the block total. The barrier stays outside the `if` so that every
         // thread of the block reaches it.
         for (int half = blockDim.x / 2; half != 0; half /= 2) {
             if (slot < half) {
                 cache[slot] += cache[slot + half];
             }
             __syncthreads();
         }

         // One thread per block writes the block's partial result.
         if (slot == 0) {
             c[blockIdx.x] = cache[0];
         }
     }
     43 
     44 
     // Per-worker job description handed to routine() through a void pointer.
     struct DataStruct {
         int deviceID;       // CUDA device index this worker runs on
         int size;           // number of elements this worker processes
         int offset;         // element offset into a and b where this worker starts
         float *a;           // host pointer to the first input vector
         float *b;           // host pointer to the second input vector
         float returnValue;  // output: the dot product computed by routine()
     };
     53 
     /*
      * Worker routine: computes the dot product of one slice of the shared
      * host vectors on the GPU named in the job description.
      *
      *   pvoidData - pointer to a DataStruct; deviceID, size, offset, a and b
      *               are read, and returnValue receives the slice's result.
      *
      * Returns 0. Terminates the process if the host scratch buffer cannot be
      * allocated; CUDA failures are reported through HANDLE_ERROR.
      */
     unsigned WINAPI routine(void *pvoidData)
     //void* routine(void *pvoidData)
     {
         DataStruct  *data = (DataStruct*)pvoidData;
         // Device 0 has already been selected and configured by main(); only
         // workers for a non-zero device need to do it here.
         if (data->deviceID != 0) {
             HANDLE_ERROR(cudaSetDevice(data->deviceID));
             // Tell the runtime that this device will use mapped (zero-copy)
             // host memory. Portability does not need to be requested again:
             // it was set when the buffers were allocated under device 0.
             HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
         }

         int     size = data->size;
         float   *a, *b, c, *partial_c;
         float   *dev_a, *dev_b, *dev_partial_c;

         // host-side inputs come from the job; only the scratch buffer for
         // the per-block partial sums is allocated here
         a = data->a;
         b = data->b;
         partial_c = (float*)malloc(blocksPerGrid*sizeof(float));
         if (partial_c == NULL) {
             printf("Host memory failed in %s at line %d\n", __FILE__, __LINE__);
             exit(EXIT_FAILURE);
         }

         // get device-side aliases of the mapped host buffers and allocate
         // the partial-sum buffer on the GPU
         HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
         HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
         HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
             blocksPerGrid*sizeof(float)));

         // offset 'a' and 'b' to where this GPU gets its data
         dev_a += data->offset;
         dev_b += data->offset;

         dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
             dev_partial_c);
         // A kernel launch returns no status itself; pick up launch errors
         // (for example a bad configuration) explicitly.
         HANDLE_ERROR(cudaGetLastError());

         // copy the partial sums back from the GPU to the CPU
         HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
             blocksPerGrid*sizeof(float),
             cudaMemcpyDeviceToHost));

         // finish up on the CPU side
         c = 0;
         for (int i = 0; i<blocksPerGrid; i++) {
             c += partial_c[i];
         }

         HANDLE_ERROR(cudaFree(dev_partial_c));

         // free memory on the CPU side
         free(partial_c);

         data->returnValue = c;
         return 0;
     }
    105 
    106 
    /*
     * Portable pinned-memory test: splits a dot product of two N-element
     * vectors across devices 0 and 1. Both vectors live in host memory that
     * is allocated as portable and mapped, so each GPU reads its half
     * directly from the host buffers.
     *
     * Returns 0 in all cases, including when fewer than two devices are
     * present or a device cannot map host memory (a message is printed).
     */
    int main(void) {
        int deviceCount;
        HANDLE_ERROR(cudaGetDeviceCount(&deviceCount));
        if (deviceCount < 2) {
            printf("We need at least two compute 1.0 or greater "
                "devices, but only found %d\n", deviceCount);
            return 0;
        }

        // Both devices used below must be able to map host memory.
        cudaDeviceProp  prop;
        for (int i = 0; i<2; i++) {
            HANDLE_ERROR(cudaGetDeviceProperties(&prop, i));
            if (prop.canMapHostMemory != 1) {
                printf("Device %d can not map memory.\n", i);
                return 0;
            }
        }

        float *a, *b;
        HANDLE_ERROR(cudaSetDevice(0));
        HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
        /*
        With device 0 selected, the buffers are allocated with
        cudaHostAllocPortable; otherwise only device 0 would treat these
        allocations as pinned memory. Portability is requested only here,
        under device 0.
        */
        HANDLE_ERROR(cudaHostAlloc((void**)&a, N*sizeof(float),
            cudaHostAllocWriteCombined |
            cudaHostAllocPortable |
            cudaHostAllocMapped));
        HANDLE_ERROR(cudaHostAlloc((void**)&b, N*sizeof(float),
            cudaHostAllocWriteCombined |
            cudaHostAllocPortable |
            cudaHostAllocMapped));

        // fill in the host memory with data
        for (int i = 0; i<N; i++) {
            a[i] = i;
            b[i] = i * 2;
        }

        // prepare for multithread: each device gets one half of the vectors
        DataStruct  data[2];
        data[0].deviceID = 0;
        data[0].offset = 0;
        data[0].size = N / 2;
        data[0].a = a;
        data[0].b = b;

        data[1].deviceID = 1;
        data[1].offset = N / 2;
        data[1].size = N / 2;
        data[1].a = a;
        data[1].b = b;

        // Device 1 runs on a helper thread while this thread drives device 0.
        CUTThread   thread = start_thread(routine, &(data[1]));
        routine(&(data[0]));
        end_thread(thread);


        // free memory on the CPU side
        HANDLE_ERROR(cudaFreeHost(a));
        HANDLE_ERROR(cudaFreeHost(b));

        printf("Value calculated:  %f\n",
            data[0].returnValue + data[1].returnValue);

        return 0;
    }
  • 相关阅读:
    js window对象
    bootstrap之输入框组
    安卓开发学习笔记(一):如何用Android Stuidio导出apk文件?
    达观杯文本智能处理挑战赛 练手代码实现
    kali linux学习笔记(四) : 网络端口大全介绍
    kali linux 网络渗透测试学习笔记(三)社会工程学之Java攻击:钓鱼网站制作
    Java提高篇(三):内部类和匿名内部类
    Java提高篇(二):IO字节流、字符流和处理流
    Java提高篇(一):区分引用变量与对象
    kali linux 网络渗透测试学习笔记(二)OWASP ZAP工具扫描SQL injection漏洞失败
  • 原文地址:https://www.cnblogs.com/liangliangdetianxia/p/3998781.html
Copyright © 2011-2022 走看看