zoukankan      html  css  js  c++  java
  • CUDA学习笔记二

    一、线程组织

      线程的组织形式对程序的性能是非常重要的。

      

      gridDim.x  线程网格X维度上线程块的数量

      gridDim.y 线程网格Y维度上线程块的数量

      blockDim.x 一个线程块X维度上的线程数量

      blockDim.y 一个线程块Y维度上的线程数量

      blockIdx.x 线程网格X维度上的线程块索引

      blockIdx.y 线程网格Y维度上的线程块索引

      threadIdx.x 线程块X维度上的线程索引

      threadIdx.y 线程块Y维度上的线程索引

    二、线程索引

      通常,一个矩阵以线性方式存储在global memory 中,并且以行优先(row-major)的顺序实现线性化:

      

      在kernel中,线程的唯一索引非常有用,为了确定一个线程的索引,我们以2D为例:

        • 线程和block索引

        • 矩阵中元素坐标

        • 线性global memory的偏移

           1. 将thread和block索引映射到矩阵坐标

        ix = threadIdx.x + blockIdx.x * blockDim.x;

        iy = threadIdx.y + blockIdx.y * blockDim.y;

      2.利用上述的变量计算线性地址:

        idx = iy *  nx + ix

      

      上图展示的是block 和 thread索引、矩阵坐标以及线性地址之间的关系。

      相邻的thread拥有连续的threadIdx.x,即索引为(0,0)(1,0)(2,0)...的thread连续

            不是(0,0)(0,1) (0,2)..连续,与线代中的矩阵不同

      

      例如:可以验证下面的关系:

      thread_id(2,1)block_id(1,0) coordinate(6,1) global index 14 ival 14

           

    三、 CUDA应用案例

      1. 查看GPU的一些硬件配置情况

     1 #include "device_launch_parameters.h"
     2 #include <iostream>
     3 
     4 int main()
     5 {
     6     int deviceCount;
     7     cudaGetDeviceCount(&deviceCount);
     8     for(int i=0;i<deviceCount;i++)
     9     {
    10         cudaDeviceProp devProp;
    11         cudaGetDeviceProperties(&devProp, i);
    12         std::cout << "使用GPU device " << i << ": " << devProp.name << std::endl;
    13         std::cout << "设备全局内存总量: " << devProp.totalGlobalMem / 1024 / 1024 << "MB" << std::endl;
    14         std::cout << "SM的数量:" << devProp.multiProcessorCount << std::endl;
    15         std::cout << "每个线程块的共享内存大小:" << devProp.sharedMemPerBlock / 1024.0 << " KB" << std::endl;
    16         std::cout << "每个线程块的最大线程数:" << devProp.maxThreadsPerBlock << std::endl;
    17         std::cout << "设备上一个线程块(Block)种可用的32位寄存器数量: " << devProp.regsPerBlock << std::endl;
    18         std::cout << "每个EM的最大线程数:" << devProp.maxThreadsPerMultiProcessor << std::endl;
    19         std::cout << "每个EM的最大线程束数:" << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
    20         std::cout << "设备上多处理器的数量: " << devProp.multiProcessorCount << std::endl;
    21         std::cout << "======================================================" << std::endl;     
    22         
    23     }
    24     return 0;
    25 }

    利用nvcc来编译程序

    nvcc  test.cu  -o  test

    输出结果:

     2.计算任务:将两个元素数目为1024*1024的float数组相加

        用CPU是如何串行完成任务:

      

    #include <iostream>
    #include <stdlib.h>
    #include <sys/time.h>
    #include <math.h>
    
    using namespace std;
    
    int main()
    {
        struct timeval start, end;
        gettimeofday( &start, NULL );
        float*A, *B, *C;
        int n = 1024 * 1024;
        int size = n * sizeof(float);
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);
    
        for(int i=0;i<n;i++)
        {
            A[i] = 90.0;
            B[i] = 10.0;
        }
        
        for(int i=0;i<n;i++)
        {
            C[i] = A[i] + B[i];
        }
    
        float max_error = 0.0;
        for(int i=0;i<n;i++)
        {
            max_error += fabs(100.0-C[i]);
        }
        cout << "max_error is " << max_error << endl;
        gettimeofday( &end, NULL );
        int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
        cout << "total time is " << timeuse/1000 << "ms" <<endl;
        return 0;
    }

     用GPU做并行计算,速度的变化:

     编程的要点:

          每个block中的Thread数最大不超过512(计算能力2.0及以上的设备为1024)

          为了充分利用SM,Block数尽可能多,>100

    # include "cuda_runtime.h"
    # include <stdlib.h>
    # include <iostream>
    # include <sys/time.h>
     
    using namespace std;
    
    __galbal__  void add(float A[], float B[],float C[], int n)
    {
         int i =  blockDim.x * blockIdx.x + threadIdx.x;
         C[i] =   A[i] + B[i];    
    }
    int main()
    {
         struct timeval start , end;
         gettimeofday (&start,NULL)
     float*A, *Ad, *B, *Bd, *C, *Cd;
        int n = 1024 * 1024;
        int size = n * sizeof(float);
    
        // CPU端分配内存
        A = (float*)malloc(size);
        B = (float*)malloc(size);
        C = (float*)malloc(size);
    
        // 初始化数组
        for(int i=0;i<n;i++)
        {
            A[i] = 90.0;
            B[i] = 10.0;
        }
    
        // GPU端分配内存
        cudaMalloc((void**)&Ad, size);
        cudaMalloc((void**)&Bd, size);
        cudaMalloc((void**)&Cd, size);
    
        // CPU的数据拷贝到GPU端
        cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);
        cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
        cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);
    
        // 定义kernel执行配置,(1024*1024/512)个block,每个block里面有512个线程
        dim3 dimBlock(512);
        dim3 dimGrid(n/512);
    
        // 执行kernel
        Plus<<<dimGrid, dimBlock>>>(Ad, Bd, Cd, n);
    
        // 将在GPU端计算好的结果拷贝回CPU端
        cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);
    
        // 校验误差
        float max_error = 0.0;
        for(int i=0;i<n;i++)
        {
            max_error += fabs(100.0 - C[i]);
        }
    
        cout << "max error is " << max_error << endl;
    
        // 释放CPU端、GPU端的内存
        free(A);
        free(B);
        free(C);
        cudaFree(Ad);
        cudaFree(Bd);
        cudaFree(Cd);
        gettimeofday( &end, NULL );
        int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
        cout << "total time is " << timeuse/1000 << "ms" <<endl;
        return 0;
    }

     从这个程序中,使用CUDA编程时没有for循环,因为CPU编程的循环已经被分散到各个thread上做了,所以没有for一类的语句了。从结果可知,CPU的循环计算的速度比GPU计算快多了,原因在于CUDA中有大量的内存拷贝操作(数据的传输花费了大量时间,但计算时间却非常少),如果计算量比较小的话,CPU计算会更合适。

    接下来案例:矩阵加法,两个矩阵对应坐标的元素相加后的结果存储在第三个的对应位置的元素上。

    这个计算任务采用二维数据的计算方式,注意一下二维数组在CUDA编程中的写法。

    CPU版本:

     1 #include <stdlib.h>
     2 #include <iostream>
     3 #include <sys/time.h>
     4 #include <math.h>
     5 
     6 #define ROWS 1024
     7 #define COLS 1024
     8 
     9 using namespace std;
    10 
    11 int main()
    12 {
    13     struct timeval start, end;
    14     gettimeofday( &start, NULL );
    15     int *A, **A_ptr, *B, **B_ptr, *C, **C_ptr;
    16     int total_size = ROWS*COLS*sizeof(int);
    17     A = (int*)malloc(total_size);
    18     B = (int*)malloc(total_size);
    19     C = (int*)malloc(total_size);
    20     A_ptr = (int**)malloc(ROWS*sizeof(int*));
    21     B_ptr = (int**)malloc(ROWS*sizeof(int*));
    22     C_ptr = (int**)malloc(ROWS*sizeof(int*));
    23     
    24     //CPU一维数组初始化
    25     for(int i=0;i<ROWS*COLS;i++)
    26     {
    27         A[i] = 80;
    28         B[i] = 20;
    29     }
    30     
    31     for(int i=0;i<ROWS;i++)
    32     {
    33         A_ptr[i] = A + COLS*i;
    34         B_ptr[i] = B + COLS*i;
    35         C_ptr[i] = C + COLS*i;
    36     }
    37     
    38     for(int i=0;i<ROWS;i++)
    39         for(int j=0;j<COLS;j++)
    40         {
    41             C_ptr[i][j] = A_ptr[i][j] + B_ptr[i][j];
    42         }
    43         
    44     //检查结果
    45     int max_error = 0;
    46     for(int i=0;i<ROWS*COLS;i++)
    47     {
    48         //cout << C[i] << endl;
    49         max_error += abs(100-C[i]);
    50     }
    51     
    52     cout << "max_error is " << max_error <<endl;     
    53     gettimeofday( &end, NULL );
    54     int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
    55     cout << "total time is " << timeuse/1000 << "ms" <<endl;
    56     
    57     return 0;
    58 }
    View Code

       GPU版本

     1 #include "cuda_runtime.h"
     2 #include "device_launch_parameters.h"
     3 #include <sys/time.h> 
     4 #include <stdio.h>
     5 #include <math.h>
     6 #define Row  1024
     7 #define Col 1024
     8  
     9  
    10 __global__ void addKernel(int **C,  int **A, int ** B)
    11 {
    12     int idx = threadIdx.x + blockDim.x * blockIdx.x;
    13     int idy = threadIdx.y + blockDim.y * blockIdx.y;
    14     if (idx < Col && idy < Row) {
    15         C[idy][idx] = A[idy][idx] + B[idy][idx];
    16     }
    17 }
    18  
    19 int main()
    20 {
    21     struct timeval start, end;
    22     gettimeofday( &start, NULL );
    23 
    24     int **A = (int **)malloc(sizeof(int*) * Row);
    25     int **B = (int **)malloc(sizeof(int*) * Row);
    26     int **C = (int **)malloc(sizeof(int*) * Row);
    27     int *dataA = (int *)malloc(sizeof(int) * Row * Col);
    28     int *dataB = (int *)malloc(sizeof(int) * Row * Col);
    29     int *dataC = (int *)malloc(sizeof(int) * Row * Col);
    30     int **d_A;
    31     int **d_B;
    32     int **d_C;
    33     int *d_dataA;
    34     int *d_dataB;
    35     int *d_dataC;
    36     //malloc device memory
    37     cudaMalloc((void**)&d_A, sizeof(int **) * Row);
    38     cudaMalloc((void**)&d_B, sizeof(int **) * Row);
    39     cudaMalloc((void**)&d_C, sizeof(int **) * Row);
    40     cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
    41     cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
    42     cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
    43     //set value
    44     for (int i = 0; i < Row*Col; i++) {
    45         dataA[i] = 90;
    46         dataB[i] = 10;
    47     }
    48     //将主机指针A指向设备数据位置,目的是让设备二级指针能够指向设备数据一级指针
    49     //A 和  dataA 都传到了设备上,但是二者还没有建立对应关系
    50     for (int i = 0; i < Row; i++) {
    51         A[i] = d_dataA + Col * i;
    52         B[i] = d_dataB + Col * i;
    53         C[i] = d_dataC + Col * i;
    54     }
    55                                                                 
    56     cudaMemcpy(d_A, A, sizeof(int*) * Row, cudaMemcpyHostToDevice);
    57     cudaMemcpy(d_B, B, sizeof(int*) * Row, cudaMemcpyHostToDevice);
    58     cudaMemcpy(d_C, C, sizeof(int*) * Row, cudaMemcpyHostToDevice);
    59     cudaMemcpy(d_dataA, dataA, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    60     cudaMemcpy(d_dataB, dataB, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    61     dim3 threadPerBlock(16, 16);
    62     dim3 blockNumber( (Col + threadPerBlock.x - 1)/ threadPerBlock.x, (Row + threadPerBlock.y - 1) / threadPerBlock.y );
    63     printf("Block(%d,%d)   Grid(%d,%d).
    ", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
    64     addKernel << <blockNumber, threadPerBlock >> > (d_C, d_A, d_B);
    65     //拷贝计算数据-一级数据指针
    66     cudaMemcpy(dataC, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
    67                                                                                              
    68     int max_error = 0;
    69     for(int i=0;i<Row*Col;i++)
    70     {
    71         //printf("%d
    ", dataC[i]);
    72         max_error += abs(100-dataC[i]);
    73     }
    74 
    75     //释放内存
    76     free(A);
    77     free(B);
    78     free(C);
    79     free(dataA);
    80     free(dataB);
    81     free(dataC);
    82     cudaFree(d_A);
    83     cudaFree(d_B);
    84     cudaFree(d_C);
    85     cudaFree(d_dataA);
    86     cudaFree(d_dataB);
    87     cudaFree(d_dataC);
    88 
    89     printf("max_error is %d
    ", max_error);
    90     gettimeofday( &end, NULL );
    91     int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
    92     printf("total time is %d ms
    ", timeuse/1000);
    93 
    94     return 0;
    95 }
    View Code

     从结果看出:CPU计算时间还是比GPU的计算时间短。

     这种二维数组的程序写法的效率并不高(比较符合我们的思维方式),因为我们做了两次访存操作。

     一般而言,做高性能计算一般不采取这种编程方式。

    案例:矩阵乘法

    CPU版本:

     1 #include <iostream>
     2 #include <stdlib.h>
     3 #include <sys/time.h>
     4 
     5 #define ROWS 1024
     6 #define COLS 1024
     7 
     8 using namespace std;
     9 
    10 void matrix_mul_cpu(float* M, float* N, float* P, int width)
    11 {
    12     for(int i=0;i<width;i++)
    13         for(int j=0;j<width;j++)
    14         {
    15             float sum = 0.0;
    16             for(int k=0;k<width;k++)
    17             {
    18                 float a = M[i*width+k];
    19                 float b = N[k*width+j];
    20                 sum += a*b;
    21             }
    22             P[i*width+j] = sum;
    23         }
    24 }
    25 
    26 int main()
    27 {
    28     struct timeval start, end;
    29     gettimeofday( &start, NULL );
    30     float *A, *B, *C;
    31     int total_size = ROWS*COLS*sizeof(float);
    32     A = (float*)malloc(total_size);
    33     B = (float*)malloc(total_size);
    34     C = (float*)malloc(total_size);
    35 
    36     //CPU一维数组初始化
    37     for(int i=0;i<ROWS*COLS;i++)
    38     {
    39         A[i] = 80.0;
    40         B[i] = 20.0;
    41     }
    42 
    43     matrix_mul_cpu(A, B, C, COLS);
    44 
    45     gettimeofday( &end, NULL );
    46     int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
    47     cout << "total time is " << timeuse/1000 << "ms" <<endl;
    48 
    49     return 0;
    50 }
    View Code

     

     GPU版本:

     1 #include "cuda_runtime.h"
     2 #include "device_launch_parameters.h"
     3 #include <sys/time.h> 
     4 #include <stdio.h>
     5 #include <math.h>
     6 #define Row  1024
     7 #define Col 1024
     8 
     9  
    10 __global__ void matrix_mul_gpu(int *M, int* N, int* P, int width)
    11 {
    12     int i = threadIdx.x + blockDim.x * blockIdx.x;
    13     int j = threadIdx.y + blockDim.y * blockIdx.y;
    14                 
    15     int sum = 0;
    16     for(int k=0;k<width;k++)
    17     {
    18         int a = M[j*width+k];
    19         int b = N[k*width+i];
    20         sum += a*b;
    21     }
    22     P[j*width+i] = sum;
    23 }
    24  
    25 int main()
    26 {
    27     struct timeval start, end;
    28     gettimeofday( &start, NULL );
    29 
    30     int *A = (int *)malloc(sizeof(int) * Row * Col);
    31     int *B = (int *)malloc(sizeof(int) * Row * Col);
    32     int *C = (int *)malloc(sizeof(int) * Row * Col);
    33     //malloc device memory
    34     int *d_dataA, *d_dataB, *d_dataC;
    35     cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
    36     cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
    37     cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
    38     //set value
    39     for (int i = 0; i < Row*Col; i++) {
    40         A[i] = 90;
    41         B[i] = 10;
    42     }
    43                                                                 
    44     cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    45     cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    46     dim3 threadPerBlock(16, 16);
    47     dim3 blockNumber((Col+threadPerBlock.x-1)/ threadPerBlock.x, (Row+threadPerBlock.y-1)/ threadPerBlock.y );
    48     printf("Block(%d,%d)   Grid(%d,%d).
    ", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
    49     matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
    50     //拷贝计算数据-一级数据指针
    51     cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
    52                                                                                              
    53     //释放内存
    54     free(A);
    55     free(B);
    56     free(C);
    57     cudaFree(d_dataA);
    58     cudaFree(d_dataB);
    59     cudaFree(d_dataC);
    60 
    61     gettimeofday( &end, NULL );
    62     int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
    63     printf("total time is %d ms
    ", timeuse/1000);
    64 
    65     return 0;
    66 }
    View Code

  • 相关阅读:
    Windows Install Twisted 安装Twisted
    raspberry pi随图形界面开机启动 被执行两次的问题
    将任意程序(如.bat文件)作为Windows服务运行
    xampp无法打开phpmyadmin解决方案
    python的subprocess无法进行通信(无法通过管道输入数据)的问题解决
    关于接地/共地
    树莓派字体安装
    windows 不能在本地计算机启动apache2 的解决方法(不是修改端口)
    5 November in 614
    模拟退火算法
  • 原文地址:https://www.cnblogs.com/lin1216/p/12340962.html
Copyright © 2011-2022 走看看