  • Using multidimensional arrays in CUDA

    A question occurred to me today: almost all of the CUDA code I have seen uses one-dimensional arrays, so can multidimensional arrays be used in CUDA at all? I tried all sorts of approaches and was tormented by error status code 77 and segmentation faults, until I finally found cudaMallocManaged. Because managed memory is accessible from both the host and the device, this function is a clean way to build the pointer-of-pointers form of a multidimensional array: the host can fill in the pointer table, and the kernel can later dereference it. I then discovered that this problem had already been solved nicely on Stack Overflow. First, my own implementation:

      1 #include "cuda_runtime.h"  
      2 #include "device_launch_parameters.h"  
      3   
      4 #include <stdio.h>  
      5 const int arraySize = 5; 
      6 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
      7   
      8 __global__ void addKernel(int **c, const int *a, const int *b)  
      9 {  
     10     int i = threadIdx.x;  
     11     if(i<arraySize)
     12         c[0][i] = a[i] + b[i];  
     13     else
     14         c[1][i-arraySize]= a[i-arraySize]+b[i-arraySize];
     15 }  
     16 
     17 int main()  
     18 {  
     19  
     20     const int a[arraySize] = { 1, 2, 3, 4, 5 };  
     21     const int b[arraySize] = { 10, 20, 30, 40, 50 };  
     22     int c[arraySize] = { 0 };  
     23   
     24     // Add vectors in parallel.  
     25     cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);  
     26     if (cudaStatus != cudaSuccess) {  
     27         fprintf(stderr, "addWithCuda failed!");  
     28         return 1;  
     29     }  
     30   
     31     printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}
    ",  
     32         c[0], c[1], c[2], c[3], c[4]);  
     33   
     34     // cudaThreadExit must be called before exiting in order for profiling and  
     35     // tracing tools such as Nsight and Visual Profiler to show complete traces.  
     36     cudaStatus = cudaThreadExit();  
     37     if (cudaStatus != cudaSuccess) {  
     38         fprintf(stderr, "cudaThreadExit failed!");  
     39         return 1;  
     40     }  
     41   
     42     return 0;  
     43 }  
     44   
     45 // Helper function for using CUDA to add vectors in parallel.  
     46 cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)  
     47 {  
     48     int *dev_a = 0;  
     49     int *dev_b = 0;  
     50     int *dev_c0;
     51     int **dev_c ;
     52     cudaError_t cudaStatus;  
     53   
     54     // Choose which GPU to run on, change this on a multi-GPU system.  
     55     cudaStatus = cudaSetDevice(0);  
     56     if (cudaStatus != cudaSuccess) {  
     57         fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");  
     58         goto Error;  
     59     }  
     60   
     61     // Allocate GPU buffers for three vectors (two input, one output)    
     62     cudaStatus  =  cudaMallocManaged(&dev_c, 2*sizeof(int*));
     63     if (cudaStatus != cudaSuccess) {  
     64         fprintf(stderr, "cudaMalloc failed!");  
     65         goto Error;  
     66     } 
     67     cudaStatus = cudaMalloc((void**)&(dev_c0), size * sizeof(int)*2);  
     68     if (cudaStatus != cudaSuccess) {  
     69         fprintf(stderr, "cudaMalloc failed!");  
     70         goto Error;  
     71     }  
     72 
     73     dev_c[0]=dev_c0;
     74     dev_c[1]=dev_c0+arraySize;
     75     cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));  
     76     if (cudaStatus != cudaSuccess) {  
     77         fprintf(stderr, "cudaMalloc failed!");  
     78         goto Error;  
     79     }  
     80   
     81     cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));  
     82     if (cudaStatus != cudaSuccess) {  
     83         fprintf(stderr, "cudaMalloc failed!");  
     84         goto Error;  
     85     }  
     86   
     87     // Copy input vectors from host memory to GPU buffers.  
     88     cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);  
     89     if (cudaStatus != cudaSuccess) {  
     90         fprintf(stderr, "cudaMemcpy failed!");  
     91         goto Error;  
     92     }  
     93   
     94     cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);  
     95     if (cudaStatus != cudaSuccess) {  
     96         fprintf(stderr, "cudaMemcpy failed!");  
     97         goto Error;  
     98     }  
     99   
    100     // Launch a kernel on the GPU with one thread for each element.  
    101     addKernel<<<1, size*2>>>(dev_c, dev_a, dev_b);  
    102   
    103     // cudaThreadSynchronize waits for the kernel to finish, and returns  
    104     // any errors encountered during the launch.  
    105     cudaStatus = cudaThreadSynchronize();  
    106     if (cudaStatus != cudaSuccess) {  
    107         fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!
    ", cudaStatus);  
    108         goto Error;  
    109     }  
    110   
    111     // Copy output vector from GPU buffer to host memory.  
    112     cudaStatus = cudaMemcpy(c, dev_c[1], size * sizeof(int), cudaMemcpyDeviceToHost);  
    113     if (cudaStatus != cudaSuccess) {  
    114         fprintf(stderr, "cudaMemcpy failed!");  
    115         goto Error;  
    116     }  
    117   
    118 Error:  
    119 
    120     cudaFree(dev_a);  
    121     cudaFree(dev_b);  
    122       
    123     return cudaStatus;  
    124 }  
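    For reference, a minimal build-and-run sketch (assuming the listing above is saved as add2d.cu; the file name is illustrative):

        $ nvcc add2d.cu -o add2d
        $ ./add2d
        {1,2,3,4,5} + {10,20,30,40,50} = {11,22,33,44,55}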

    Rather than embarrass myself further next to someone else's excellent code, here is the Stack Overflow version, which is very intuitive and easy to follow:

    // https://stackoverflow.com/questions/40388242/multidimensional-array-allocation-with-cuda-unified-memory-on-power-8

    #include <iostream>
    #include <assert.h>

    template<typename T>
    T**** create_4d_flat(int a, int b, int c, int d) {
        // One flat managed allocation for all a*b*c*d elements...
        T *base;
        cudaError_t err = cudaMallocManaged(&base, a * b * c * d * sizeof(T));
        assert(err == cudaSuccess);
        // ...and one managed allocation for the entire pointer tree:
        // a top-level pointers, a*b second-level, a*b*c third-level.
        T ****ary;
        err = cudaMallocManaged(&ary, (a + a * b + a * b * c) * sizeof(T*));
        assert(err == cudaSuccess);
        for (int i = 0; i < a; i++) {
            ary[i] = (T ***)((ary + a) + i * b);
            for (int j = 0; j < b; j++) {
                ary[i][j] = (T **)((ary + a + a * b) + i * b * c + j * c);
                for (int k = 0; k < c; k++)
                    ary[i][j][k] = base + ((i * b + j) * c + k) * d;
            }
        }
        return ary;
    }

    template<typename T>
    void free_4d_flat(T**** ary) {
        // ary[0][0][0] points at the base of the flat data allocation.
        if (ary[0][0][0]) cudaFree(ary[0][0][0]);
        if (ary) cudaFree(ary);
    }

    // Single-threaded fill: writes consecutive values through all four
    // levels of indirection.
    template<typename T>
    __global__ void fill(T**** data, int a, int b, int c, int d) {
        unsigned long long int val = 0;
        for (int i = 0; i < a; i++)
            for (int j = 0; j < b; j++)
                for (int k = 0; k < c; k++)
                    for (int l = 0; l < d; l++)
                        data[i][j][k][l] = val++;
    }

    void report_gpu_mem()
    {
        size_t free, total;
        cudaMemGetInfo(&free, &total);
        std::cout << "Free = " << free << " Total = " << total << std::endl;
    }

    int main() {
        report_gpu_mem();

        unsigned long long int ****data2;
        std::cout << "allocating..." << std::endl;
        data2 = create_4d_flat<unsigned long long int>(64, 63, 62, 5);

        report_gpu_mem();

        fill<<<1, 1>>>(data2, 64, 63, 62, 5);
        cudaError_t err = cudaDeviceSynchronize();
        assert(err == cudaSuccess);

        // Because the data is one contiguous block, it can be validated
        // with a single linear scan from the base pointer.
        std::cout << "validating..." << std::endl;
        for (int i = 0; i < 64 * 63 * 62 * 5; i++)
            if (*(data2[0][0][0] + i) != i) {
                std::cout << "mismatch at " << i << " was " << *(data2[0][0][0] + i) << std::endl;
                return -1;
            }
        free_4d_flat(data2);
        return 0;
    }
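    To make the layout easier to see, here is the same two-allocation idea specialized to 2D. This is a minimal sketch of my own, not code from the original answer; the names create_2d_flat, free_2d_flat, and the scale kernel are illustrative:

        #include <cstdio>
        #include <assert.h>

        // Hypothetical 2D version of the pointer-tree technique: one managed
        // allocation for the payload, one for the row-pointer table.
        template <typename T>
        T** create_2d_flat(int rows, int cols) {
            T *base;
            cudaError_t err = cudaMallocManaged(&base, (size_t)rows * cols * sizeof(T));
            assert(err == cudaSuccess);
            T **ary;
            err = cudaMallocManaged(&ary, (size_t)rows * sizeof(T*));
            assert(err == cudaSuccess);
            for (int i = 0; i < rows; i++)
                ary[i] = base + (size_t)i * cols;  // row i starts i*cols into the flat buffer
            return ary;
        }

        template <typename T>
        void free_2d_flat(T** ary) {
            if (ary) { cudaFree(ary[0]); cudaFree(ary); }
        }

        // Each thread writes one element through the double pointer.
        __global__ void scale(float **m, int rows, int cols, float s) {
            int i = blockIdx.y * blockDim.y + threadIdx.y;
            int j = blockIdx.x * blockDim.x + threadIdx.x;
            if (i < rows && j < cols)
                m[i][j] = s * (i * cols + j);
        }

        int main() {
            const int rows = 4, cols = 8;
            float **m = create_2d_flat<float>(rows, cols);

            dim3 block(8, 4);
            scale<<<1, block>>>(m, rows, cols, 0.5f);
            cudaError_t err = cudaDeviceSynchronize();
            assert(err == cudaSuccess);

            // Managed memory: after the sync, the host can index m[i][j] directly.
            printf("m[3][7] = %f\n", m[3][7]);  // expect 0.5 * 31 = 15.5
            free_2d_flat(m);
            return 0;
        }

    The design point is the same as in the 4D code: the payload stays contiguous, so one kernel launch or one linear scan can cover all of it, while the managed pointer table gives both host and device the usual m[i][j] syntax.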
  • Original post: https://www.cnblogs.com/jourluohua/p/10765685.html