zoukankan      html  css  js  c++  java
  • CUDA实战2

    1.clock()函数是C/C++中的计时函数,相关的数据类型是clock_t,使用clock函数可以计算运行某一段程序所需的时间,如下所示程序计算从10000000逐渐减一直到0所需的时间。

    注:每次运行所需时间可能会不一样

     1 #include "cuda_runtime.h"
     2 #include "device_launch_parameters.h"
     3 #include <stdio.h>
     4 #include <time.h>
     /*
      * Demonstrates timing a busy loop with clock().
      *
      * Prints the starting counter value, counts it down to zero, and reports
      * the processor time consumed by the loop in seconds.
      */
     int main()
     {
         clock_t start, end;
         double duration;
         /* volatile forces every decrement to really happen; without it an
          * optimizing compiler may delete the empty loop and report ~0 s. */
         volatile long n = 10000000L;

         printf("使 %ld 循环减一变为 0 所需的时间是:", n);

         /* Start of the measured region. */
         start = clock();
         /* Count down by one until the counter reaches zero. */
         while (n > 0)
         {
             n = n - 1;
         }
         /* End of the measured region. */
         end = clock();

         /* clock() returns ticks; CLOCKS_PER_SEC (from <time.h>) is the number
          * of ticks per second. Its value is implementation-defined, so a tick
          * is not necessarily one millisecond. Dividing yields seconds. */
         duration = (double)(end - start) / CLOCKS_PER_SEC;
         printf(" %f 秒\n", duration);
         return 0;
     }
    View Code

     

     

     2.

    CLOCKS_PER_SEC,它用来表示一秒钟会有多少个时钟计时单元。它的取值由具体实现决定,例如在 MSVC 的 time.h 中定义如下(POSIX 系统上则规定为 1000000):
    #define CLOCKS_PER_SEC ((clock_t)1000)
    在这一定义下,每过千分之一秒(1毫秒),调用clock()函数返回的值就加1。
    可以使用公式clock()/CLOCKS_PER_SEC来计算一个进程自身的运行时间。
     

    矢量求和运算

    假设我们有两组数据,我们需要将这两组数据中对应的元素两两相加,并将结果保存在第三个数组中。

      1 //CUDA的头文件
      2 #include "cuda_runtime.h"
      3 #include "device_launch_parameters.h"
      4 //C语言的头文件
      5 #include <stdio.h>
      6 #include <time.h>
      7 
      8 #define N 6000
      9 #define thread_num 1024
     10 
     11 //GPU函数声明
     12 __global__ void add(int* a, int* b, int* c);
     13 //CPU函数声明
     14 void add_CPU(int *a, int *b,int *c);
     15 
     // Reports a failed CUDA runtime call on stderr.
     // Returns 1 when err is cudaSuccess, 0 otherwise.
     static int cudaOk(cudaError_t err, const char *what)
     {
         if (err != cudaSuccess)
         {
             fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
             return 0;
         }
         return 1;
     }

     // Adds two N-element integer vectors on the CPU and on the GPU, prints both
     // results, and reports how long each version took.
     //
     // Timing sources:
     //   - CPU version: host clock() around add_CPU.
     //   - GPU version: CUDA events around the kernel, plus host clock() spanning
     //     launch through completion.
     // All reported times are in milliseconds.
     //
     // Returns 0 on success and 1 if any CUDA call fails. Error paths return
     // immediately without releasing device resources; the process is exiting.
     int main()
     {
         // GPU timing via CUDA events (milliseconds).
         float time_GPU = 0.0f;
         cudaEvent_t start_GPU, stop_GPU;
         // Host-side timing via clock().
         float time_cpu, time_gpu;
         clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;

         int a[N], b[N], c[N], c_CPU[N];
         int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

         // Enough blocks to cover all N elements (ceiling division).
         int block_num = (N + thread_num - 1) / thread_num;

         // Allocate device buffers.
         if (!cudaOk(cudaMalloc((void**)&dev_a, N * sizeof(int)), "cudaMalloc(dev_a)")) return 1;
         if (!cudaOk(cudaMalloc((void**)&dev_b, N * sizeof(int)), "cudaMalloc(dev_b)")) return 1;
         if (!cudaOk(cudaMalloc((void**)&dev_c, N * sizeof(int)), "cudaMalloc(dev_c)")) return 1;

         // Fill the inputs on the host.
         for (int i = 0; i < N; i++)
         {
             a[i] = -i;
             b[i] = i * i;
         }

         // Time the CPU reference implementation.
         start_cpu = clock();
         add_CPU(a, b, c_CPU);
         stop_cpu = clock();
         // clock() differences divided by CLOCKS_PER_SEC are seconds; scale to
         // milliseconds so the value matches the "(ms)" label.
         time_cpu = 1000.0f * (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC;
         printf("The time for CPU:\t%f(ms)\n", time_cpu);

         // Print the CPU result.
         printf("\nResult from CPU:\n");
         for (int i = 0; i < N; i++)
         {
             printf("CPU:\t%d+%d=%d\n", a[i], b[i], c_CPU[i]);
         }

         // Copy the inputs to the device.
         if (!cudaOk(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")) return 1;
         if (!cudaOk(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)")) return 1;

         // Create the timing events.
         if (!cudaOk(cudaEventCreate(&start_GPU), "cudaEventCreate(start)")) return 1;
         if (!cudaOk(cudaEventCreate(&stop_GPU), "cudaEventCreate(stop)")) return 1;

         start_gpu = clock();
         if (!cudaOk(cudaEventRecord(start_GPU, 0), "cudaEventRecord(start)")) return 1;

         // Launch the kernel. A launch reports configuration errors only
         // through cudaGetLastError().
         add<<<block_num, thread_num>>>(dev_a, dev_b, dev_c);
         if (!cudaOk(cudaGetLastError(), "add kernel launch")) return 1;

         if (!cudaOk(cudaEventRecord(stop_GPU, 0), "cudaEventRecord(stop)")) return 1;
         // The launch is asynchronous: wait for the stop event before reading
         // the host clock, otherwise only the launch overhead is measured.
         if (!cudaOk(cudaEventSynchronize(stop_GPU), "cudaEventSynchronize(stop)")) return 1;
         stop_gpu = clock();

         if (!cudaOk(cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU), "cudaEventElapsedTime")) return 1;
         printf("\nThe time from GPU :\t%f(ms)\n", time_GPU);

         // Copy the result back to the host (one copy is sufficient).
         if (!cudaOk(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(c)")) return 1;

         // Print the GPU result.
         printf("\nResult from GPU:\n");
         for (int i = 0; i < N; i++)
         {
             printf("GPU:\t%d+%d=%d\n", a[i], b[i], c[i]);
         }

         cudaEventDestroy(start_GPU);
         cudaEventDestroy(stop_GPU);

         // Release device memory.
         cudaFree(dev_a);
         cudaFree(dev_b);
         cudaFree(dev_c);

         // No CUDA events are recorded around the CPU run, so there is no
         // event-based CPU time to report; only the GPU has one.
         printf("\nThe time for GPU by event:\t%f(ms)\n", time_GPU);

         time_gpu = 1000.0f * (float)(stop_gpu - start_gpu) / CLOCKS_PER_SEC;
         printf("\nThe time for CPU by host:\t%f(ms)\n", time_cpu);
         printf("The time for GPU by host:\t%f(ms)\n", time_gpu);

         return 0;
     }
    // GPU kernel: element-wise sum c[i] = a[i] + b[i] over the first N elements.
    // Each thread derives one global index from its block and thread position
    // in x and handles that single element; threads whose index is N or larger
    // do nothing.
    __global__ void add(int *a, int *b, int *c)
    {
        const int idx = threadIdx.x + blockDim.x * blockIdx.x;
        if (idx >= N)
        {
            return;
        }
        c[idx] = a[idx] + b[idx];
    }
    120 
    // CPU reference: writes a[i] + b[i] into c[i] for every i in [0, N).
    void add_CPU(int *a, int *b, int *c)
    {
        int idx = 0;
        while (idx < N)
        {
            c[idx] = a[idx] + b[idx];
            ++idx;
        }
    }

     3.CUDA实现矩阵乘

      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 #include <time.h>
      4 #include "cuda_runtime.h"
      5 #include "device_launch_parameters.h"
      6 #define M 1024
      7 #define K 1024
      8 #define N 1024
      9 
     // Fills the first row*col entries of `list` with values rand() % 10.
     // The generator is not seeded here, so the sequence depends on whatever
     // seed state the caller left in place.
     void initial(double* list,int row,int col)
     {
         const int total = row * col;
         double *cursor = list;
         for (int filled = 0; filled < total; ++filled)
         {
             *cursor = rand() % 10;
             ++cursor;
         }
     }
     19 
     // CPU reference matrix multiply: C = A * B.
     //   A is read as M rows by K columns, B as K rows by N columns, and C is
     //   written as M rows by N columns, all row-major (see the index
     //   expressions below). M, K and N are the file-level macros.
     void CpuMatrix(double *A,double *B,double *C)
     {
         // Loop indices are declared in their loops; the original also declared
         // an outer `k` that was never used and was shadowed by the inner loop.
         for (int i = 0; i < M; i++)
         {
             for (int j = 0; j < N; j++)
             {
                 // Dot product of row i of A with column j of B.
                 double sum = 0;
                 for (int k = 0; k < K; k++)
                 {
                     sum += A[i * K + k] * B[k * N + j];
                 }
                 C[i * N + j] = sum;
             }
         }
     }
     37 
     // GPU matrix multiply: dev_C = dev_A * dev_B, one output element per thread.
     //   dev_A is read as M x K, dev_B as K x N, dev_C is written as M x N, all
     //   row-major. M, K and N are the file-level macros.
     // Launch layout: 2D grid of 2D blocks; the x dimension covers the N output
     // columns and the y dimension covers the M output rows. Threads outside
     // the M x N output do nothing.
     __global__ void GpuMatrix(double *dev_A,double *dev_B,double *dev_C)
     {
         int ix = threadIdx.x + blockDim.x * blockIdx.x;   // output column
         int iy = threadIdx.y + blockDim.y * blockIdx.y;   // output row

         // ix indexes columns of B and C, which have N columns, so the guard
         // must use N. Comparing against K is only correct when K == N.
         if (ix < N && iy < M)
         {
             double sum = 0;
             for (int k = 0; k < K; k++)
             {
                 sum += dev_A[iy * K + k] * dev_B[k * N + ix];
             }
             dev_C[iy * N + ix] = sum;
         }
     }
     53 
     // Prints a row-major row x col matrix to stdout, one matrix row per line,
     // each value formatted with "%10lf".
     void printMatrix(double *list,int row,int col)
     {
         double *p = list;   // start of the current row
         for (int i = 0; i < row; i++)
         {
             for (int j = 0; j < col; j++)
             {
                 printf("%10lf", p[j]);
             }
             p = p + col;    // advance to the next row
             // End the row with a newline escape; the literal was previously
             // split across two source lines, which does not compile.
             printf("\n");
         }
     }
     67 
     // Reports a failed CUDA runtime call on stderr.
     // Returns 1 when err is cudaSuccess, 0 otherwise.
     static int cudaOk(cudaError_t err, const char *what)
     {
         if (err != cudaSuccess)
         {
             fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
             return 0;
         }
         return 1;
     }

     // Multiplies an M x K matrix by a K x N matrix on the CPU and on the GPU,
     // reports the time of each version in milliseconds, and prints the largest
     // absolute difference between the two results.
     //
     // Returns 0 on success and 1 if a host allocation or any CUDA call fails.
     // Error paths after the host allocations return immediately without
     // releasing resources; the process is exiting.
     int main(int argc,char **argv)
     {
         (void)argc;
         (void)argv;

         // Threads per block along each axis. TILE * TILE must not exceed the
         // per-block limit of 1024 threads; the original 1024 x 1024 block
         // exceeded it, so the kernel never ran.
         const int TILE = 32;

         clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
         double time_cpu, time_gpu;

         float time_GPU = 0.0f;
         cudaEvent_t start_GPU, stop_GPU;

         // Element and byte counts, kept in size_t so the products cannot
         // overflow int for larger matrix sizes.
         // printf("Amatrix:(%d*%d)\n",M,K);
         const size_t Axy = (size_t)M * K;
         const size_t Abytes = Axy * sizeof(double);

         // printf("Bmatrix:(%d*%d)\n",K,N);
         const size_t Bxy = (size_t)K * N;
         const size_t Bbytes = Bxy * sizeof(double);

         const size_t nxy = (size_t)M * N;
         const size_t nbytes = nxy * sizeof(double);

         // host_C holds the CPU result, gpu_C the result copied back from the GPU.
         double *host_A = (double*)malloc(Abytes);
         double *host_B = (double*)malloc(Bbytes);
         double *host_C = (double*)malloc(nbytes);
         double *gpu_C = (double*)malloc(nbytes);
         if (host_A == NULL || host_B == NULL || host_C == NULL || gpu_C == NULL)
         {
             fprintf(stderr, "host allocation failed\n");
             free(host_A);
             free(host_B);
             free(host_C);
             free(gpu_C);
             return 1;
         }

         // Initialize the inputs.
         initial(host_A, M, K);
         printf("A:(%d,%d):\n", M, K);
         // printMatrix(host_A,M,K);

         initial(host_B, K, N);
         printf("B:(%d,%d):\n", K, N);
         // printMatrix(host_B,K,N);

         // Time the CPU reference.
         start_cpu = clock();
         CpuMatrix(host_A, host_B, host_C);
         stop_cpu = clock();
         // clock() differences divided by CLOCKS_PER_SEC are seconds; scale to
         // milliseconds so the value matches the "(ms)" label.
         time_cpu = 1000.0 * (double)(stop_cpu - start_cpu) / CLOCKS_PER_SEC;

         printf("The time from CPU is \t%f(ms)\n", time_cpu);
         printf("Host_C:(%d,%d):\n", M, N);
         // printMatrix(host_C,M,N);

         // GPU computation.
         double *dev_A = NULL, *dev_B = NULL, *dev_C = NULL;
         if (!cudaOk(cudaMalloc((void**)&dev_A, Abytes), "cudaMalloc(dev_A)")) return 1;
         if (!cudaOk(cudaMalloc((void**)&dev_B, Bbytes), "cudaMalloc(dev_B)")) return 1;
         if (!cudaOk(cudaMalloc((void**)&dev_C, nbytes), "cudaMalloc(dev_C)")) return 1;

         // x covers the N output columns and y the M output rows; ceiling
         // division so sizes that are not multiples of TILE are still covered.
         dim3 block(TILE, TILE);
         dim3 grid((N + TILE - 1) / TILE, (M + TILE - 1) / TILE);

         if (!cudaOk(cudaMemcpy(dev_A, host_A, Abytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_A)")) return 1;
         if (!cudaOk(cudaMemcpy(dev_B, host_B, Bbytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_B)")) return 1;

         // Create the timing events.
         if (!cudaOk(cudaEventCreate(&start_GPU), "cudaEventCreate(start)")) return 1;
         if (!cudaOk(cudaEventCreate(&stop_GPU), "cudaEventCreate(stop)")) return 1;

         start_gpu = clock();
         if (!cudaOk(cudaEventRecord(start_GPU, 0), "cudaEventRecord(start)")) return 1;

         // A launch reports configuration errors only through cudaGetLastError().
         GpuMatrix<<<grid, block>>>(dev_A, dev_B, dev_C);
         if (!cudaOk(cudaGetLastError(), "GpuMatrix kernel launch")) return 1;

         if (!cudaOk(cudaEventRecord(stop_GPU, 0), "cudaEventRecord(stop)")) return 1;
         // The launch is asynchronous: wait for the stop event before reading
         // the host clock, otherwise only the launch overhead is measured.
         if (!cudaOk(cudaEventSynchronize(stop_GPU), "cudaEventSynchronize(stop)")) return 1;
         stop_gpu = clock();

         // Elapsed GPU time between the two events, in milliseconds.
         if (!cudaOk(cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU), "cudaEventElapsedTime")) return 1;
         printf("The time from GPU is \t%f(ms)\n", time_GPU);

         // Destroy the events.
         cudaEventDestroy(start_GPU);
         cudaEventDestroy(stop_GPU);

         if (!cudaOk(cudaMemcpy(gpu_C, dev_C, nbytes, cudaMemcpyDeviceToHost), "cudaMemcpy(gpu_C)")) return 1;
         printf("device_C:(%d,%d):\n", M, N);
         // printMatrix(gpu_C,M,N);

         // Compare the GPU result against the CPU reference.
         double max_diff = 0.0;
         for (size_t i = 0; i < nxy; i++)
         {
             double diff = host_C[i] - gpu_C[i];
             if (diff < 0.0)
             {
                 diff = -diff;
             }
             if (diff > max_diff)
             {
                 max_diff = diff;
             }
         }
         printf("Max |CPU - GPU| difference:\t%e\n", max_diff);

         // Release memory.
         cudaFree(dev_A);
         cudaFree(dev_B);
         cudaFree(dev_C);
         free(host_A);
         free(host_B);
         free(host_C);
         free(gpu_C);

         time_gpu = 1000.0 * (double)(stop_gpu - start_gpu) / CLOCKS_PER_SEC;
         printf("\nThe time for CPU by host:\t%f(ms)\n", time_cpu);
         printf("The time for GPU by host:\t%f(ms)\n", time_gpu);
         return 0;
     }

     
     
  • 相关阅读:
    Java实现 蓝桥杯 历届试题 连号区间数
    Java实现 蓝桥杯 历届试题 连号区间数
    Java实现 蓝桥杯 历届试题 连号区间数
    Java实现 蓝桥杯 历届试题 连号区间数
    Java实现 蓝桥杯 历届试题 连号区间数
    Java实现 蓝桥杯 历届试题 大臣的旅费
    Java实现 蓝桥杯 历届试题 大臣的旅费
    Java实现 蓝桥杯 历届试题 大臣的旅费
    Java实现 蓝桥杯 历届试题 大臣的旅费
    Navicat查询哪些表有指定字段名
  • 原文地址:https://www.cnblogs.com/lin1216/p/12677841.html
Copyright © 2011-2022 走看看