zoukankan      html  css  js  c++  java
  • CUDA实战3

    1.两个一维数组相加,求和

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 void     initial(float *list,int size)
      7 {
      8     float *num = list;
      9     //srand((unsigned)time(NULL));
     10     for (int i=0; i<size; i++)
     11     {
     12         num[i] = rand()%10;
     13     }
     14     
     15 }
     16 void sumMatrix(float* MatA, float* MatB,float *MatC,int nx,int ny)
     17 {
     18     float* a=MatA;
     19     float* b=MatB;
     20     float* c=MatC;
     21     for(int j=0; j<ny;j++)
     22     {
     23         for(int i=0; i<nx;i++)
     24         {
     25             c[i] = a[i] + b[i];
     26         }
     27         c += nx;
     28         b += nx;
     29         a += nx;
     30     }
     31 }
     32 
     33 
     34 //核函数
     35 __global__ void GPUsumMatrix(float* MatA,float* MatB,float* MatC,int nx,int ny)
     36 {
     37     int ix = threadIdx.x + blockDim.x * blockIdx.x;
     38     int iy = threadIdx.y + blockDim.y * blockIdx.y;
     39     int idx = ix + iy * ny;
     40     if(ix<nx && iy<ny)
     41     {
     42         MatC[idx] = MatA[idx] + MatB[idx];
     43     //    printf("
      C: %f 
    ",MatC[idx]);
     44     }
     45 } 
     46 
     47 void printList(float* A,int size)
     48 {
     49     for (int i=0;i<size;i++)
     50     {
     51         printf("  %f   ",A[i]);
     52     }
     53 }
     54 int main(int argc, char** argv)
     55 {
     56      //CPU计时方法
     57     float time_cpu, time_gpu;
     58     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
     59     //GPU计时方法
     60     float time_CPU, time_GPU;
     61     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
     62 
     63     //输入二维矩阵
     64     int nx = 1<<12;
     65     int ny = 1<<12;
     66     int nBytes = nx * ny *sizeof(float);
     67     //开辟主机内存
     68     float *A_host = (float*)malloc(nBytes);
     69     float *B_host = (float*)malloc(nBytes);
     70     float *C_host = (float*)malloc(nBytes); 
     71     float *C_from_gpu = (float*)malloc(nBytes);
     72     initial(A_host,nx*ny);
     73     printf("A_host is:");
     74 //    printList(A_host,nx*ny);
     75     initial(B_host,nx*ny);
     76     printf("
    B_host is:");
     77 //    printList(B_host,nx*ny);
     78     
     79   
     80    //记录当前时间
     81     start_cpu = clock();
     82     sumMatrix(A_host,B_host,C_host,nx,ny);
     83     stop_cpu = clock();
     84     printf("
    The time from CPU:	%f(ms)
    ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
     85     
     86     //输出结果
     87     printf(" CPU result is :
    ");
     88 //    printList(C_host,nx*ny);
     89     
     90     //开辟设备内存
     91     float* A_dev = NULL;
     92     float* B_dev = NULL;
     93     float* C_dev = NULL;
     94     cudaMalloc((void**)&A_dev,nBytes);
     95     cudaMalloc((void**)&B_dev,nBytes);
     96     cudaMalloc((void**)&C_dev,nBytes);
     97     
     98     //输入数据,从hostTO device
     99     cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice); 
    100     cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice); 
    101     dim3 block(2,2);
    102     dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);
    103     // 创建Event
    104     cudaEventCreate(&start_GPU);
    105     cudaEventCreate(&stop_GPU);
    106    //记录当前时间
    107     cudaEventRecord(start_GPU,0);
    108     start_gpu = clock();
    109     
    110     GPUsumMatrix<<<grid,block>>>(A_dev,B_dev,C_dev,nx,ny); 
    111     sumMatrix(A_host,B_host,C_host,nx,ny);
    112   
    113     stop_gpu = clock();
    114     cudaEventRecord(stop_GPU,0);
    115     cudaEventSynchronize(start_GPU);
    116     cudaEventSynchronize(stop_GPU);
    117     //计算时间差
    118     cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    119     printf("
    The time from GPU:	%f(ms)
    ", time_GPU/1000);
    120     //消除Event
    121     cudaEventDestroy(start_GPU);
    122     cudaEventDestroy(stop_GPU);
    123     cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost); 
    124     //输出结果
    125     printf(" GPU result is :
    ");
    126 //    printList(C_from_gpu,nx*ny);
    127     
    128     cudaFree(A_dev);
    129     cudaFree(B_dev);
    130     cudaFree(C_dev);
    131     free(A_host);
    132     free(B_host);
    133     free(C_host);
    134     
    135      time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    136      time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    137      printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    138      printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    139      
    140     
    141     cudaDeviceReset();
    142     return 0;
    143 }

     2.

    由于加法的交换律和结合律,数组可以以任意顺序求和。所以我们会自然而然产生这样的思路:首先把输入数组划分为更小的数据块,之后用一个线程计算一个数据块的部分和,最后把所有部分和再求和得出最终结果。

    依照以上的思路,我们可以按下图这样计算。就上面的输入例子而言,首先需要我们开辟一个8个int的存储空间,图中一行的数据代表我们开辟的存储空间。计算时首先将相邻的两数相加(也称相邻配对),结果写入第一个数的存储空间内。第二轮迭代时我们再将第一次的结果两两相加得出下一级结果,一直重复这个过程最后直到我们得到最终的结果,空白方框里面存储的内容是我们不需要的。这个过程相当于,每一轮迭代后,选取被加数的跨度翻倍,后面我们核函数就是这样实现的。

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 void     initial(float *list,int size)
      7 {
      8     float *num = list;
      9     //srand((unsigned)time(NULL));
     10     for (int i=0; i<size; i++)
     11     {
     12         num[i] = rand()%10;
     13     }
     14     
     15 }
     16 void sumMatrix(float* MatA, float* MatB,int size)
     17 {
     18     float* a=MatA;
     19     float* b=MatB;
     20     int i = 0;
     21     for(int j=0; j<size;j++)
     22     {
     23         b[i] += a[j];
     24     }
     25 
     26 }
     27 
     28 
     29 //核函数
     30 __global__ void GPUreduceNeighbored(float* g_idata,float* g_odata, unsigned int n)
     31 {
     32     //set thread ID
     33     unsigned int tid = threadIdx.x;
     34     if(tid >= n) return;
     35     float *idata = g_idata + blockIdx.x * blockDim.x;
     36     for(int stride = 1; stride < blockDim.x; stride*=2)
     37     {
     38         if((tid%(2*stride))==0)
     39         {
     40             idata[tid] += idata[tid+stride];
     41         }
     42         __syncthreads();
     43     }
     44     if(tid == 0)
     45     {
     46         g_odata[blockIdx.x] = idata[0];
     47     }
     48     
     49 }
     50 void printList(float* A,int size)
     51 {
     52     for (int i=0;i<size;i++)
     53     {
     54         printf("  %f   ",A[i]);
     55     }
     56 }
     57 int main(int argc, char** argv)
     58 {
     59      //CPU计时方法
     60     float time_cpu, time_gpu;
     61     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
     62     //GPU计时方法
     63     float time_CPU, time_GPU;
     64     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
     65 
     66     //输入一维数组 
     67     int size = 1<<24;
     68 
     69     int nBytes = size *sizeof(float);
     70     //开辟主机内存
     71     float *A_host = (float*)malloc(nBytes);
     72     float *B_host = (float*)malloc(nBytes);
     73     float *C_from_gpu = (float*)malloc(nBytes);
     74     
     75     initial(A_host,size);
     76     printf("A_host is:");
     77   //    printList(A_host,size);
     78     
     79     
     80    //记录当前时间
     81     start_cpu = clock();
     82  
     83     sumMatrix(A_host,B_host,size);
     84    
     85     stop_cpu = clock(); 
     86     
     87     printf("
    The time from CPU:	%f(ms)
    ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
     88 
     89     //输出结果
     90     printf(" CPU result is :
    ");
     91 //    printList(B_host,1);
     92     
     93     //开辟设备内存
     94     float* A_dev = NULL;
     95     float* B_dev = NULL;
     96 
     97     cudaMalloc((void**)&A_dev,nBytes);
     98     cudaMalloc((void**)&B_dev,nBytes);
     99 //    cudaMalloc((void**)&C_dev,nBytes);
    100     
    101     //输入数据,从hostTO device
    102     cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice); 
    103     //cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice); 
    104     dim3 block(1024,1);
    105     dim3 grid((size-1)/block.x+1,1);
    106     // 创建Event
    107     cudaEventCreate(&start_GPU);
    108     cudaEventCreate(&stop_GPU);
    109    //记录当前时间
    110     cudaEventRecord(start_GPU,0);
    111     start_gpu = clock();
    112     
    113     GPUreduceNeighbored<<<grid,block>>>(A_dev,B_dev,size); 
    114   
    115     stop_gpu = clock();
    116     cudaEventRecord(stop_GPU,0);
    117     cudaEventSynchronize(start_GPU);
    118     cudaEventSynchronize(stop_GPU);
    119     //计算时间差
    120     cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    121     printf("
    The time from GPU:	%f(ms)
    ", time_GPU/1000);
    122     //消除Event
    123     cudaEventDestroy(start_GPU);
    124     cudaEventDestroy(stop_GPU);
    125     cudaMemcpy(C_from_gpu,B_dev,nBytes,cudaMemcpyDeviceToHost); 
    126     //输出结果
    127     printf(" GPU result is :
    ");
    128 //    printList(C_from_gpu,1);
    129     
    130     cudaFree(A_dev);
    131     cudaFree(B_dev);
    132 
    133     free(A_host);
    134     free(B_host);
    135 
    136     
    137      time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    138      time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    139      printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    140      printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    141      
    142     
    143     cudaDeviceReset();
    144     return 0;
    145 }

     3.

    线程束分化

    首先我们来回忆一下上个程序的实现思路,我们是利用stride变量来实现每轮迭代时的被加数选择,这造成线程束的分化,也就是每个线程束中只有部分线程是活跃的,但由于硬件设计,调度会以一整个线程束为单位进行,所以影响了程序的效率。

    这种情况下,我们可以通过重新组织线程索引来解决线程束分化的问题。我们把核函数的代码进行修改:

    相比于之前的初版代码,我们现在使用线程ID来生成一个数组访问索引,这种方式有效地避免了线程束的分化。可能会有点迷惑,我们更具体说说,在每个线程块有1024个线程(32个线程束)时,在第一轮迭代,前16个线程束执行计算,后16个线程束什么都不做;而原来的代码中,32个线程束都执行计算,但只有偶数标号的线程活跃。第二轮迭代时,前8个线程束执行计算,后24个线程束什么都不做;而原来的代码中,32个线程束都执行计算,但只有标号是4的倍数的线程活跃。这样重新组织线程ID后,线程束分化就被避免掉了。我们来实际运行一下,看看效果。

    可见尽量避免线程束的分化十分重要。这给了我们一点的启发,看似不起眼的细节,在CUDA编程中却会产生不小的影响,这也是我们需要了解底层硬件运行机制的一个重要原因。

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 void     initial(float *list,int size)
      7 {
      8     float *num = list;
      9     //srand((unsigned)time(NULL));
     10     for (int i=0; i<size; i++)
     11     {
     12         num[i] = rand()%10;
     13     }
     14     
     15 }
     16 void sumMatrix(float* MatA, float* MatB,int size)
     17 {
     18     float* a=MatA;
     19     float* b=MatB;
     20     int i = 0;
     21     for(int j=0; j<size;j++)
     22     {
     23         b[i] += a[j];
     24     }
     25 
     26 }
     27 
     28 
     29 //核函数
     30 __global__ void GPUreduceNeighbored(float* g_idata,float* g_odata, unsigned int n)
     31 {
     32       unsigned int tid = threadIdx.x;
     33     unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
     34     // convert global data pointer to the local point of this block
     35     float *idata = g_idata + blockIdx.x*blockDim.x;
     36     if (idx > n)
     37         return;
     38     //in-place reduction in global memory
     39     for (int stride = 1; stride < blockDim.x; stride *= 2)
     40     {
     41         //convert tid into local array index
     42         int index = 2 * stride *tid;
     43         if (index < blockDim.x)
     44         {
     45             idata[index] += idata[index + stride];
     46         }
     47         __syncthreads();
     48     }
     49     //write result for this block to global men
     50     if (tid == 0)
     51         g_odata[blockIdx.x] = idata[0];
     52     
     53 }
     54 void printList(float* A,int size)
     55 {
     56     for (int i=0;i<size;i++)
     57     {
     58         printf("  %f   ",A[i]);
     59     }
     60 }
     61 int main(int argc, char** argv)
     62 {
     63      //CPU计时方法
     64     float time_cpu, time_gpu;
     65     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
     66     //GPU计时方法
     67     float time_CPU, time_GPU;
     68     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
     69 
     70     //输入一维数组 
     71     int size = 1<<24;
     72 
     73     int nBytes = size *sizeof(float);
     74     //开辟主机内存
     75     float *A_host = (float*)malloc(nBytes);
     76     float *B_host = (float*)malloc(nBytes);
     77     float *C_from_gpu = (float*)malloc(nBytes);
     78     
     79     initial(A_host,size);
     80     printf("A_host is:");
     81   //    printList(A_host,size);
     82     
     83    
     84    //记录当前时间
     85     start_cpu = clock();
     86    
     87     sumMatrix(A_host,B_host,size);
     88    
     89     stop_cpu = clock(); 
     90     
     91     printf("
    The time from CPU:	%f(ms)
    ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
     92     //输出结果
     93     printf(" CPU result is :
    ");
     94 //    printList(B_host,1);
     95     
     96     //开辟设备内存
     97     float* A_dev = NULL;
     98     float* B_dev = NULL;
     99 
    100     cudaMalloc((void**)&A_dev,nBytes);
    101     cudaMalloc((void**)&B_dev,nBytes);
    102 //    cudaMalloc((void**)&C_dev,nBytes);
    103     
    104     //输入数据,从hostTO device
    105     cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice); 
    106     //cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice); 
    107     dim3 block(1024,1);
    108     dim3 grid((size-1)/block.x+1,1);
    109     // 创建Event
    110     cudaEventCreate(&start_GPU);
    111     cudaEventCreate(&stop_GPU);
    112    //记录当前时间
    113     cudaEventRecord(start_GPU,0);
    114     start_gpu = clock();
    115     
    116     GPUreduceNeighbored<<<grid,block>>>(A_dev,B_dev,size); 
    117   
    118     stop_gpu = clock();
    119     cudaEventRecord(stop_GPU,0);
    120     cudaEventSynchronize(start_GPU);
    121     cudaEventSynchronize(stop_GPU);
    122     //计算时间差
    123     cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    124     printf("
    The time from GPU:	%f(ms)
    ", time_GPU/1000);
    125     //消除Event
    126     cudaEventDestroy(start_GPU);
    127     cudaEventDestroy(stop_GPU);
    128     cudaMemcpy(C_from_gpu,B_dev,nBytes,cudaMemcpyDeviceToHost); 
    129     //输出结果
    130     printf(" GPU result is :
    ");
    131 //    printList(C_from_gpu,1);
    132     
    133     cudaFree(A_dev);
    134     cudaFree(B_dev);
    135 
    136     free(A_host);
    137     free(B_host);
    138 
    139     
    140      time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    141      time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    142      printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    143      printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    144      
    145     
    146     cudaDeviceReset();
    147     return 0;
    148 }

     4.

    内存组织

    在CPU编程中,我们学过空间局部性与时间局部性,这在CUDA编程中对我们也是很有启发的。之前我们采用相邻配对进行求和,这种方法最为直观,但会造成在第二轮之后内存访问的不连续(因为使用了stride变量作为跨度)。为了缓解这种现象,我们重新组织一下配对方法,让对内存的访问更加集中,如下图,这种新的配对方法我们称为交错配对。

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 void     initial(float *list,int size)
      7 {
      8     float *num = list;
      9     //srand((unsigned)time(NULL));
     10     for (int i=0; i<size; i++)
     11     {
     12         num[i] = rand()%10;
     13     }
     14     
     15 }
     16 void sumMatrix(float* MatA, float* MatB,int size)
     17 {
     18     float* a=MatA;
     19     float* b=MatB;
     20     int i = 0;
     21     for(int j=0; j<size;j++)
     22     {
     23         b[i] += a[j];
     24     }
     25 
     26 }
     27 
     28 
     29 //核函数
     30 __global__ void GPUreduceNeighbored(float* g_idata,float* g_odata, unsigned int n)
     31 {
     32     unsigned int tid = threadIdx.x;
     33     unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
     34     // convert global data pointer to the local point of this block
     35     float *idata = g_idata + blockIdx.x*blockDim.x;
     36     if (idx >= n)
     37         return;
     38     //in-place reduction in global memory
     39     for (int stride = blockDim.x/2; stride >0; stride >>=1)
     40     {
     41 
     42         if (tid <stride)
     43         {
     44             idata[tid] += idata[tid + stride];
     45         }
     46         __syncthreads();
     47     }
     48     //write result for this block to global men
     49     if (tid == 0)
     50         g_odata[blockIdx.x] = idata[0];
     51     
     52 }
     53 void printList(float* A,int size)
     54 {
     55     for (int i=0;i<size;i++)
     56     {
     57         printf("  %f   ",A[i]);
     58     }
     59 }
     60 int main(int argc, char** argv)
     61 {
     62      //CPU计时方法
     63     float time_cpu, time_gpu;
     64     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
     65     //GPU计时方法
     66     float time_CPU, time_GPU;
     67     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
     68 
     69     //输入一维数组 
     70     int size = 1<<24;
     71 
     72     int nBytes = size *sizeof(float);
     73     //开辟主机内存
     74     float *A_host = (float*)malloc(nBytes);
     75     float *B_host = (float*)malloc(nBytes);
     76     float *C_from_gpu = (float*)malloc(nBytes);
     77     
     78     initial(A_host,size);
     79     printf("A_host is:");
     80   //    printList(A_host,size);
     81     
     82     
     83    //记录当前时间
     84     start_cpu = clock();
     85    
     86     sumMatrix(A_host,B_host,size);
     87    
     88     stop_cpu = clock(); 
     89   
     90     printf("
    The time from CPU:	%f(ms)
    ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
     91     
     92     //输出结果
     93     printf(" CPU result is :
    ");
     94 //    printList(B_host,1);
     95     
     96     //开辟设备内存
     97     float* A_dev = NULL;
     98     float* B_dev = NULL;
     99 
    100     cudaMalloc((void**)&A_dev,nBytes);
    101     cudaMalloc((void**)&B_dev,nBytes);
    102 //    cudaMalloc((void**)&C_dev,nBytes);
    103     
    104     //输入数据,从hostTO device
    105     cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice); 
    106     //cudaMemcpy(B_dev,B_host,nBytes,cudaMemcpyHostToDevice); 
    107     dim3 block(1024,1);
    108     dim3 grid((size-1)/block.x+1,1);
    109     // 创建Event
    110     cudaEventCreate(&start_GPU);
    111     cudaEventCreate(&stop_GPU);
    112    //记录当前时间
    113     cudaEventRecord(start_GPU,0);
    114     start_gpu = clock();
    115     
    116     GPUreduceNeighbored<<<grid,block>>>(A_dev,B_dev,size); 
    117   
    118     stop_gpu = clock();
    119     cudaEventRecord(stop_GPU,0);
    120     cudaEventSynchronize(start_GPU);
    121     cudaEventSynchronize(stop_GPU);
    122     //计算时间差
    123     cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    124     printf("
    The time from GPU:	%f(ms)
    ", time_GPU/1000);
    125     //消除Event
    126     cudaEventDestroy(start_GPU);
    127     cudaEventDestroy(stop_GPU);
    128     cudaMemcpy(C_from_gpu,B_dev,nBytes,cudaMemcpyDeviceToHost); 
    129     //输出结果
    130     printf(" GPU result is :
    ");
    131 //    printList(C_from_gpu,1);
    132     
    133     cudaFree(A_dev);
    134     cudaFree(B_dev);
    135 
    136     free(A_host);
    137     free(B_host);
    138 
    139     
    140      time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    141      time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    142      printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    143      printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    144      
    145     
    146     cudaDeviceReset();
    147     return 0;
    148 }

    这给我们的启发是,对全局内存的访问要尽量进行合并访问与存储,这样才能达到最大的带宽。

    程序从2.48ms加速到1.36ms,累计获得了1.8倍的加速比。而这两点,恰恰对应了系列开篇中我们提到的CUDA编程的特点,线程组织和内存组织,这也是我们在CUDA编程中需要随时注意的。

    5.

    展开循环以隐藏延时

    (1)现在使用高级语言编程时,我们已经不会刻意去进行循环展开了,因为这件事会由编译器帮我们完成。但在CUDA编程中,循环展开具有很重要的意义,它能给线程束调度器提供更多可用的线程束,以帮助我们有效地隐藏延时。

    于是我们可以使用循环展开的方法,对并行归约的程序再来一波优化。

    我们之前只是用一个线程块来处理一个小数组,我们称其为一个数据块。如果我们使用一个线程块手动展开两个数据块的处理,那会怎样呢?先给个结论,这样通过减少指令消耗和增加更多的独立调度指令,更多的并发操作被添加到流水线上,以产生了更高的指令和内存带宽。反映在宏观上就是程序执行的总体时间变少了。

    (2)从概念上来讲,可以把它作为归约循环的一个迭代,此循环可在数据块间进行归约。

    如果每个线程处理两个数据块,那么我们需要的线程块总量会变为原来的一半,因此主函数也要对应修改。

    看上去,这样处理后线程块减少了,与我们之前要使用尽量多线程块的理论不符。但实际我们通过这种方式,让一个线程中有更多的独立内存加载/存储操作,这样可以更好地隐藏内存延时,更好地使用设备内存读取吞吐量的指标,以产生更好的性能。所以我们编程时,各个策略要针对实际情况结合使用。

    
    

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    
    


    void initial(int *list,int size)
    {
    int *num = list;
    for (int i=0; i<size; i++)
    {
    num[i] = rand()%10;
    }

    }

    
    

    void sumMatrix(int* MatA, int* MatB,int size)
    {
    int* a=MatA;
    int* b=MatB;
    int i = 0;
    b[0] = 0;
    for(int j=0; j<size;j++)
    {
    b[i] += a[j];
    }

    
    

    }

    
    


    //核函数
    __global__ void GPUreduceNeighbored(int* g_idata,int* g_odata, unsigned int n)
    {
    //set thread ID
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockDim.x*blockIdx.x*2+threadIdx.x;

    
    

    if (tid >= n) return;

    
    

    int *idata = g_idata + blockIdx.x*blockDim.x*2;
    if(idx+blockDim.x<n)
    {
    g_idata[idx]+=g_idata[idx+blockDim.x];

    
    

    }
    __syncthreads();

    
    

    for (int stride = blockDim.x/2; stride>0 ; stride >>=1)
    {
    if (tid <stride)
    {
    idata[tid] += idata[tid + stride];
    }
    __syncthreads();
    }
    if (tid == 0)
    g_odata[blockIdx.x] = idata[0];

    }

    
    

    void printList(int* A,int size)
    {
    for (int i=0;i<size;i++)
    {
    printf(" %d ",A[i]);
    }
    }

    
    

    int main(int argc, char** argv)
    {
    //CPU计时方法
    float time_cpu, time_gpu;
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    //GPU计时方法
    float time_CPU, time_GPU;
    cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;

    
    

    //输入一维数组
    int size = 1<<24;
    size_t nBytes = size * sizeof(int);
    //开辟主机内存
    int *A_host = (int*)malloc(nBytes);
    int *B_host = (int*)malloc(nBytes);
    int *C_from_gpu = (int*)malloc(nBytes);

    initial(A_host,size);
    printf("A_host is:");

    
    

    start_cpu = clock();

    sumMatrix(A_host,B_host,size);

    stop_cpu = clock();

    printf(" The time from CPU: %f(ms) ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);

    
    

    printf(" CPU result is : ");
    printList(B_host,1);

    int blocksize = 1024;
    dim3 block(blocksize, 1);
    dim3 grid((size - 1) / block.x + 1, 1);
    printf(" grid %d block %d ", grid.x, block.x);

    int* idata_dev = NULL;
    int* odata_dev = NULL;
    cudaMalloc((void**)&idata_dev,nBytes);
    cudaMalloc((void**)&odata_dev,grid.x * sizeof(int));
    int gpu_sum=0;
    cudaMemcpy(idata_dev,A_host,nBytes,cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    // 创建Event
    cudaEventCreate(&start_GPU);
    cudaEventCreate(&stop_GPU);
    //记录当前时间
    cudaEventRecord(start_GPU,0);

    start_gpu = clock();

    GPUreduceNeighbored<<<grid.x/2,block>>>(idata_dev,odata_dev,size);

    stop_gpu = clock();

    cudaEventRecord(stop_GPU,0);
    cudaEventSynchronize(start_GPU);
    cudaEventSynchronize(stop_GPU);
    cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    printf(" The time from GPU: %f(ms) ", time_GPU/1000);
    cudaDeviceSynchronize();
    cudaEventDestroy(start_GPU);
    cudaEventDestroy(stop_GPU);

    cudaMemcpy(C_from_gpu,odata_dev,grid.x * sizeof(int),cudaMemcpyDeviceToHost);
    for(int i=0;i<grid.x/2;i++)
    {
    gpu_sum += C_from_gpu[i];
    }
    printf(" GPU result is :%d ",gpu_sum);
    // printList(C_from_gpu,1);

    cudaFree(idata_dev);
    cudaFree(odata_dev);
    free(A_host);
    free(B_host);
    time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    printf(" The time for CPU by host: %f(ms) ", time_cpu);
    printf("The time for GPU by host: %f(ms) ", time_gpu);

    cudaDeviceReset();
    return 0;
    }

    
    

    既然一个线程块处理2个数据块能获得这么高的加速比,那么处理4个,8个呢?

    随着处理数据块数量的增多,处理时间不断降低。不过随着设备内存吞吐量逐渐到达极限,这个时间就不会继续降低了。

    总结一下,为了隐藏延时,我们需要合理地增加一个线程块中需要处理的数据量,以便线程束调度器进行调度。

     6.

    展开线程

    我们可以再进一步想想,当执行到最后几次迭代时,当只需要32个或更少线程时,每次迭代后还需要进行线程束同步。为了加速,我们可以把这最后6次迭代进行展开,使用下面的语句:

    if(tid<32)
        {
            volatile int *vmem = idata;
            vmem[tid]+=vmem[tid+32];
            vmem[tid]+=vmem[tid+16];
            vmem[tid]+=vmem[tid+8];
            vmem[tid]+=vmem[tid+4];
            vmem[tid]+=vmem[tid+2];
            vmem[tid]+=vmem[tid+1];
        }

    这样就避免了执行控制循环逻辑和线程同步逻辑的时间。

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 
      7 
      8 void     initial(int *list,int size)
      9 {
     10     int *num = list;
     11     for (int i=0; i<size; i++)
     12     {
     13         num[i] = rand()%10;
     14     }
     15     
     16 }
     17 
     18 void sumMatrix(int* MatA, int* MatB,int size)
     19 {
     20     int* a=MatA;
     21     int* b=MatB;
     22     int i = 0;
     23     b[0] = 0;
     24     for(int j=0; j<size;j++)
     25     {
     26         b[i] += a[j];
     27     }
     28 
     29 }
     30 
     31 
     32 //核函数
     33 __global__ void GPUreduceNeighbored(int* g_idata,int* g_odata, unsigned int n)
     34 {
     35    //set thread ID
     36     unsigned int tid = threadIdx.x;
     37     unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x;
     38     //boundary check
     39     if (tid >= n) return;
     40     //convert global data pointer to the
     41     int *idata = g_idata + blockIdx.x*blockDim.x*8;
     42     //unrolling 8;
     43     if(idx+7 * blockDim.x<n)
     44     {
     45         int a1=g_idata[idx];
     46         int a2=g_idata[idx+blockDim.x];
     47         int a3=g_idata[idx+2*blockDim.x];
     48         int a4=g_idata[idx+3*blockDim.x];
     49         int a5=g_idata[idx+4*blockDim.x];
     50         int a6=g_idata[idx+5*blockDim.x];
     51         int a7=g_idata[idx+6*blockDim.x];
     52         int a8=g_idata[idx+7*blockDim.x];
     53         g_idata[idx]=a1+a2+a3+a4+a5+a6+a7+a8;
     54 
     55     }
     56     __syncthreads();
     57     //in-place reduction in global memory
     58     for (int stride = blockDim.x/2; stride>32; stride >>=1)
     59     {
     60         if (tid <stride)
     61         {
     62             idata[tid] += idata[tid + stride];
     63         }
     64         //synchronize within block
     65         __syncthreads();
     66     }
     67     //write result for this block to global mem
     68     if(tid<32)
     69     {
     70         volatile int *vmem = idata;
     71         vmem[tid]+=vmem[tid+32];
     72         vmem[tid]+=vmem[tid+16];
     73         vmem[tid]+=vmem[tid+8];
     74         vmem[tid]+=vmem[tid+4];
     75         vmem[tid]+=vmem[tid+2];
     76         vmem[tid]+=vmem[tid+1];
     77 
     78     }
     79 
     80     if (tid == 0)
     81         g_odata[blockIdx.x] = idata[0];
     82     
     83 }
     84 
     85 void printList(int* A,int size)
     86 {
     87     for (int i=0;i<size;i++)
     88     {
     89         printf("  %d   ",A[i]);
     90     }
     91 }
     92 
     93 int main(int argc, char** argv)
     94 {
     95      //CPU计时方法
     96     float time_cpu, time_gpu;
     97     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
     98     //GPU计时方法
     99     float time_CPU, time_GPU;
    100     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
    101 
    102     //输入一维数组 
    103     int size = 1<<24;
    104     size_t nBytes = size * sizeof(int);
    105     //开辟主机内存
    106     int *A_host = (int*)malloc(nBytes);
    107     int *B_host = (int*)malloc(nBytes);
    108     int *C_from_gpu = (int*)malloc(nBytes);
    109     
    110     initial(A_host,size);
    111     printf("A_host is:");
    112 
    113     start_cpu = clock();
    114    
    115     sumMatrix(A_host,B_host,size);
    116    
    117     stop_cpu = clock(); 
    118     
    119     printf("
    The time from CPU:	%f(ms)
    ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
    120     
    121 
    122     printf(" CPU result is :
    ");
    123     printList(B_host,1);
    124     
    125      int blocksize = 1024;
    126     dim3 block(blocksize, 1);
    127     dim3 grid((size - 1) / block.x + 1, 1);
    128     printf("
    grid %d block %d 
    ", grid.x, block.x);
    129    
    130     int* idata_dev = NULL;
    131     int* odata_dev = NULL;
    132     cudaMalloc((void**)&idata_dev,nBytes);
    133     cudaMalloc((void**)&odata_dev,grid.x * sizeof(int));
    134     int gpu_sum=0;
    135     cudaMemcpy(idata_dev,A_host,nBytes,cudaMemcpyHostToDevice); 
    136     cudaDeviceSynchronize();
    137     // 创建Event
    138     cudaEventCreate(&start_GPU);
    139     cudaEventCreate(&stop_GPU);
    140    //记录当前时间
    141     cudaEventRecord(start_GPU,0);
    142    
    143     start_gpu = clock();
    144     
    145     GPUreduceNeighbored<<<grid.x/8,block>>>(idata_dev,odata_dev,size); 
    146   
    147     stop_gpu = clock();
    148     
    149     cudaEventRecord(stop_GPU,0);
    150     cudaEventSynchronize(start_GPU);
    151     cudaEventSynchronize(stop_GPU);
    152     cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    153     printf("
    The time from GPU:	%f(ms)
    ", time_GPU/1000);
    154     cudaDeviceSynchronize();
    155     cudaEventDestroy(start_GPU);
    156     cudaEventDestroy(stop_GPU);
    157    
    158     cudaMemcpy(C_from_gpu,odata_dev,grid.x * sizeof(int),cudaMemcpyDeviceToHost); 
    159     for(int i=0;i<grid.x/8;i++)
    160     {
    161         gpu_sum += C_from_gpu[i];    
    162     }
    163     printf(" GPU result is :%d
    ",gpu_sum);
    164     printf("gpu reduceUnrollWarp8 elapsed %lf ms    (grid %d block %d)
    ",
    165          time_GPU/1000, grid.x/8, block.x);
    166    // printList(C_from_gpu,1);
    167     
    168     cudaFree(idata_dev);
    169     cudaFree(odata_dev);
    170     free(A_host);
    171     free(B_host);
    172     time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    173     time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    174     printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    175     printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    176      
    177     cudaDeviceReset();
    178     return 0;
    179 }

     7.

    完全展开

    理论上如果编译时已知一个循环中的迭代次数,就可以把循环完全展开。因为我们的核函数中的循环迭代次数是基于一个线程块维度的,而且一个线程块支持最大1024个线程,所以我们可以将这个循环进行完全展开。核函数代码如下:

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 
      7 
      8 void     initial(int *list,int size)
      9 {
     10     int *num = list;
     11     for (int i=0; i<size; i++)
     12     {
     13         num[i] = rand()%10;
     14     }
     15     
     16 }
     17 
     18 void sumMatrix(int* MatA, int* MatB,int size)
     19 {
     20     int* a=MatA;
     21     int* b=MatB;
     22     int i = 0;
     23     b[0] = 0;
     24     for(int j=0; j<size;j++)
     25     {
     26         b[i] += a[j];
     27     }
     28 
     29 }
     30 
     31 
     32 //核函数
     33 __global__ void GPUreduceNeighbored(int* g_idata,int* g_odata, unsigned int n)
     34 {
     35    //set thread ID
     36     unsigned int tid = threadIdx.x;
     37     unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x;
     38     //boundary check
     39     if (tid >= n) return;
     40     //convert global data pointer to the
     41     int *idata = g_idata + blockIdx.x*blockDim.x*8;
     42     if(idx+7 * blockDim.x<n)
     43     {
     44         int a1=g_idata[idx];
     45         int a2=g_idata[idx+blockDim.x];
     46         int a3=g_idata[idx+2*blockDim.x];
     47         int a4=g_idata[idx+3*blockDim.x];
     48         int a5=g_idata[idx+4*blockDim.x];
     49         int a6=g_idata[idx+5*blockDim.x];
     50         int a7=g_idata[idx+6*blockDim.x];
     51         int a8=g_idata[idx+7*blockDim.x];
     52         g_idata[idx]=a1+a2+a3+a4+a5+a6+a7+a8;
     53 
     54     }
     55     __syncthreads();
     56     //in-place reduction in global memory
     57     if(blockDim.x>=1024 && tid <512)
     58         idata[tid]+=idata[tid+512];
     59     __syncthreads();
     60     if(blockDim.x>=512 && tid <256)
     61         idata[tid]+=idata[tid+256];
     62     __syncthreads();
     63     if(blockDim.x>=256 && tid <128)
     64         idata[tid]+=idata[tid+128];
     65     __syncthreads();
     66     if(blockDim.x>=128 && tid <64)
     67         idata[tid]+=idata[tid+64];
     68     __syncthreads();
     69     //write result for this block to global mem
     70     if(tid<32)
     71     {
     72         volatile int *vmem = idata;
     73         vmem[tid]+=vmem[tid+32];
     74         vmem[tid]+=vmem[tid+16];
     75         vmem[tid]+=vmem[tid+8];
     76         vmem[tid]+=vmem[tid+4];
     77         vmem[tid]+=vmem[tid+2];
     78         vmem[tid]+=vmem[tid+1];
     79 
     80     }
     81 
     82     if (tid == 0)
     83         g_odata[blockIdx.x] = idata[0];
     84     
     85 }
     86 
     87 void printList(int* A,int size)
     88 {
     89     for (int i=0;i<size;i++)
     90     {
     91         printf("  %d   ",A[i]);
     92     }
     93 }
     94 
     95 int main(int argc, char** argv)
     96 {
     97      //CPU计时方法
     98     float time_cpu, time_gpu;
     99     clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    100     //GPU计时方法
    101     float time_CPU, time_GPU;
    102     cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
    103 
    104     //输入一维数组 
    105     int size = 1<<24;
    106     size_t nBytes = size * sizeof(int);
    107     //开辟主机内存
    108     int *A_host = (int*)malloc(nBytes);
    109     int *B_host = (int*)malloc(nBytes);
    110     int *C_from_gpu = (int*)malloc(nBytes);
    111     
    112     initial(A_host,size);
    113     printf("A_host is:");
    114 
    115     start_cpu = clock();
    116    
    117     sumMatrix(A_host,B_host,size);
    118    
    119     stop_cpu = clock(); 
    120     
    121     printf("
    The time from CPU:	%f(ms)
    ", (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC);
    122     
    123 
    124     printf(" CPU result is :
    ");
    125     printList(B_host,1);
    126     
    127      int blocksize = 1024;
    128     dim3 block(blocksize, 1);
    129     dim3 grid((size - 1) / block.x + 1, 1);
    130     printf("
    grid %d block %d 
    ", grid.x, block.x);
    131    
    132     int* idata_dev = NULL;
    133     int* odata_dev = NULL;
    134     cudaMalloc((void**)&idata_dev,nBytes);
    135     cudaMalloc((void**)&odata_dev,grid.x * sizeof(int));
    136     int gpu_sum=0;
    137     cudaMemcpy(idata_dev,A_host,nBytes,cudaMemcpyHostToDevice); 
    138     cudaDeviceSynchronize();
    139     // 创建Event
    140     cudaEventCreate(&start_GPU);
    141     cudaEventCreate(&stop_GPU);
    142    //记录当前时间
    143     cudaEventRecord(start_GPU,0);
    144    
    145     start_gpu = clock();
    146     
    147     GPUreduceNeighbored<<<grid.x/8,block>>>(idata_dev,odata_dev,size); 
    148   
    149     stop_gpu = clock();
    150     
    151     cudaEventRecord(stop_GPU,0);
    152     cudaEventSynchronize(start_GPU);
    153     cudaEventSynchronize(stop_GPU);
    154     cudaEventElapsedTime(&time_GPU, start_GPU,stop_GPU);
    155     printf("
    The time from GPU:	%f(ms)
    ", time_GPU/1000);
    156     cudaDeviceSynchronize();
    157     cudaEventDestroy(start_GPU);
    158     cudaEventDestroy(stop_GPU);
    159    
    160     cudaMemcpy(C_from_gpu,odata_dev,grid.x * sizeof(int),cudaMemcpyDeviceToHost); 
    161     for(int i=0;i<grid.x/8;i++)
    162     {
    163         gpu_sum += C_from_gpu[i];    
    164     }
    165     printf(" GPU result is :%d
    ",gpu_sum);
    166     printf("gpu reduceUnrollWarp8 elapsed %lf ms    (grid %d block %d)
    ",
    167          time_GPU/1000, grid.x/8, block.x);
    168    // printList(C_from_gpu,1);
    169     
    170     cudaFree(idata_dev);
    171     cudaFree(odata_dev);
    172     free(A_host);
    173     free(B_host);
    174     time_cpu = (float) (stop_cpu-start_cpu) / CLOCKS_PER_SEC;
    175     time_gpu = (float) (stop_gpu-start_gpu) / CLOCKS_PER_SEC;
    176     printf("
    The time for CPU by host:	%f(ms)
    ", time_cpu);
    177     printf("The time for GPU by host:	%f(ms)
    ", time_gpu);
    178      
    179     cudaDeviceReset();
    180     return 0;
    181 }

    注:其中并行规约看的是知乎文章:https://zhuanlan.zhihu.com/p/98463812

  • 相关阅读:
    Servlet深层知识
    HTTP协议
    Web开发中的主要概念
    Schema约束
    连接池基础
    Mysql基础3
    JDBC常用接口详解
    Mysql基础2
    Mysql基础1
    使用12c的DBCA创建数据库的时候报错TNS-04404
  • 原文地址:https://www.cnblogs.com/lin1216/p/12682543.html
Copyright © 2011-2022 走看看