zoukankan      html  css  js  c++  java
  • CUDA实例练习(十二):矩阵相乘

     1 #include <stdio.h>
     2 #include <cuda_runtime.h>
     3 #include <device_launch_parameters.h>
     4 #include <stdlib.h>
     5 #include <time.h>
     6 
     // Threads per block used for the matMult launch.
     #define THREAD_NUM 256
     // Side length n of the square n x n matrices.
     #define MATRIX_SIZE 1000
     // Number of blocks needed so that blocks_num * THREAD_NUM threads cover all
     // MATRIX_SIZE * MATRIX_SIZE output elements (one element per thread).
     // This is a ceiling division over the total element count; the previous
     // expression multiplied before rounding and over-provisioned the grid.
     const int blocks_num = (MATRIX_SIZE * MATRIX_SIZE + THREAD_NUM - 1) / THREAD_NUM;
     // Fills an n x n row-major matrix with pseudo-random values.
     void matgen(float *a, int n);
    11 
    /// Generate a random matrix.
    ///
    /// Writes (float)rand() into each of the n * n elements of `a`, visiting
    /// them in row-major order (row by row, left to right).
    /// `a` must point to storage for at least n * n floats.
    void matgen(float *a, int n){
        for (int row = 0; row < n; ++row){
            // Start of the current row inside the flat buffer.
            float *dst = a + row * n;
            for (int col = 0; col < n; ++col){
                dst[col] = (float)rand();
                // printf("%f ", dst[col]);
            }
        }
    }
    23 
    // __global__ kernel: computes the n x n matrix product c = a * b in parallel.
    //
    // a, b, c are device pointers to row-major n x n float matrices.
    // Launch layout: 1-D grid of 1-D blocks. Each thread produces whole output
    // elements; the flat element index is derived from blockIdx.x, blockDim.x
    // and threadIdx.x, and the loop advances by the total thread count, so the
    // result is correct for any block size and any grid size (including grids
    // smaller than n * n threads). No shared memory is used.
    __global__ static void matMult(const float *a, const float *b, float * c,int n){
        // Nothing to compute (and avoids dividing by zero below).
        if (n <= 0){
            return;
        }

        const size_t dim = (size_t)n;
        const size_t total = dim * dim;

        // Total number of threads in the launch; the step between the
        // elements handled by one thread.
        const size_t stride = (size_t)gridDim.x * blockDim.x;

        // Use the real block size (blockDim.x) rather than a compile-time
        // constant so the index stays valid if the launch configuration changes.
        for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
             idx < total; idx += stride){
            // Map the flat index back to the output element's row and column.
            const size_t row = idx / dim;
            const size_t column = idx % dim;

            // Dot product of row `row` of a with column `column` of b.
            float t = 0.0f;
            for (size_t i = 0; i < dim; i++){
                t += a[row * dim + i] * b[i * dim + column];
            }
            c[idx] = t;
        }
    }
    46 
    // Aborts the program with a diagnostic if a CUDA runtime call failed.
    static void checkCuda(cudaError_t status, const char *what){
        if (status != cudaSuccess){
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    // Generates two random MATRIX_SIZE x MATRIX_SIZE matrices on the host,
    // multiplies them on the GPU with matMult, and prints the kernel's elapsed
    // time in milliseconds (as measured by CUDA events) followed by a newline.
    // Returns 0 on success; exits with a failure status on any allocation or
    // CUDA error.
    int main(void){
        cudaEvent_t stop, start;
        checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
        checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

        // Matrix dimensions and buffers.
        float *a, *b, *c;
        int n = MATRIX_SIZE;
        float elapsedTime = 0;
        const size_t bytes = sizeof(float) * n * n;

        // Allocate host memory.
        a = (float *)malloc(bytes);
        b = (float *)malloc(bytes);
        c = (float *)malloc(bytes);
        if (a == NULL || b == NULL || c == NULL){
            fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
            free(a);
            free(b);
            free(c);
            return EXIT_FAILURE;
        }

        // Fixed seed so every run multiplies the same matrices.
        srand(0);

        // Fill the input matrices with random values.
        matgen(a, n);
        matgen(b, n);

        // Allocate device memory and upload the inputs.
        float *d_a, *d_b, *d_c;
        checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
        checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
        checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");
        checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
        checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

        // Time only the kernel: both events are recorded on the same stream as
        // the launch, and waiting on `stop` also waits for the kernel itself.
        checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
        matMult<<<blocks_num, THREAD_NUM, 0>>>(d_a, d_b, d_c, n);
        // A kernel launch returns no status; a bad launch configuration is
        // only visible through cudaGetLastError().
        checkCuda(cudaGetLastError(), "matMult launch");
        checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
        checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
        // Surfaces any fault raised while the kernel was executing.
        checkCuda(cudaDeviceSynchronize(), "matMult execution");
        checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
        printf("%f\n", elapsedTime);

        // Download the result.
        checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");
        /*for (int i = 0; i < 100; i++){
            printf("%f ", c[i]);
        }*/

        // Release device resources.
        checkCuda(cudaFree(d_a), "cudaFree(d_a)");
        checkCuda(cudaFree(d_b), "cudaFree(d_b)");
        checkCuda(cudaFree(d_c), "cudaFree(d_c)");
        checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
        checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

        // Release host buffers.
        free(a);
        free(b);
        free(c);

        return 0;
    }
  • 相关阅读:
    Serverless
    Kubernetes
    下一代微服务-ServiceMesh
    SOA服务治理
    微服务架构
    RPC之Thrift
    【MySQL】MySQL高可用架构之MHA
    远程通信的几种选择(RPC,Webservice,RMI,JMS的区别)
    Redis与Memcached的区别
    LVS简介
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7423164.html
Copyright © 2011-2022 走看看