zoukankan      html  css  js  c++  java
  • cuda编程-矩阵乘法(1)

    本方法采用简单的单线程计算每组行和列乘加运算

    代码如下:

    #include <stdio.h>
    #include <stdlib.h>
    #include <iostream>
    #include <cuda_runtime.h>
    
    __global__ void matrixMulKernel(float *C, float *A, float *B, int width, int height){
      int tx = blockIdx.x * blockDim.x + threadIdx.x;
      int ty = blockIdx.y * blockDim.y + threadIdx.y;
      if(tx >= width || ty >= height)
        return;
    
      float sum = 0;
      for(int i=0; i<width; ++i){
        sum += A[ty * width + i] * B[i * width + tx];
      }
    
      C[ty * width + tx] = sum;
    }
    
    void constantInit(float *data, int size, float val){
        for (int i = 0; i < size; ++i){
            data[i] = val;
        }
    }
    
    
    void matrixMul(){
      unsigned int width = 128;
      unsigned int height = 128;
      unsigned int size = width * height * sizeof(float);
      float *h_A = (float*)malloc(size);
      float *h_B = (float*)malloc(size);
      float *h_C = (float*)malloc(size);
      // Initialize host memory
      const float valB = 0.01f;
      constantInit(h_A, width*height, 1.0f);
      constantInit(h_B, width*height, valB);
    
      float *d_A, *d_B, *d_C;
      cudaMalloc((void**)&d_A, size);
      cudaMalloc((void**)&d_B, size);
      cudaMalloc((void**)&d_C, size);
    
      //copy host memory to device
      cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
      cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    
      //config dims
      dim3 block(16, 16);
      dim3 grid(width / block.x, height / block.y);
    
      // Excute the kernel
      matrixMulKernel<<<grid, block>>>(d_C, d_A, d_B, width, height);
    
      // Copy the memory from device to host
      cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    
      printf("Checking computed result for correctness: ");
      bool correct = true;
      // test relative error by the formula
      //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
      double eps = 1.e-6 ; // machine zero
    
      for (int i = 0; i < width*height; i++){
          double abs_err = fabs(h_C[i] - (width * valB));
          double dot_length = width;
          double abs_val = fabs(h_C[i]);
          double rel_err = abs_err/abs_val/dot_length ;
          if (rel_err > eps)
          {
              printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E
    ", i, h_C[i], (float)(width*height), eps);
              correct = false;
          }
      }
      printf("%s
    ", correct ? "Result = PASS" : "Result = FAIL");
    
      // Free
      free(h_A);
      free(h_B);
      free(h_C);
      cudaFree(d_A);
      cudaFree(d_B);
      cudaFree(d_C);
    
    }
    
    int main(){
      matrixMul();
    }
  • 相关阅读:
    树莓派添加桌面快捷方式
    计算机网络
    django-auth2
    令牌桶算法-python
    linux centos-7 添加开机自启动脚本
    pymongodb-explain
    哈希表
    tcp/udp
    jemeter之jmeter+ant+jenkins搭建接口自动化测试环境
    jmeter之jmeter + ant + jenkins(二)Jenkins安装
  • 原文地址:https://www.cnblogs.com/haiyang21/p/7788989.html
Copyright © 2011-2022 走看看