zoukankan      html  css  js  c++  java
  • CUDA[4] sample program: matrix-vector multiplication

    Use Compressed Sparse Row Format (CSR) to represent matrix

      1 #include "cuda_runtime.h"
      2 #include "device_launch_parameters.h"
      3 #include "gputimer.h"
      4 #include<stdio.h>
      5 #include<stdlib.h>
      6 #include<string.h>
      7 #define WARP_SIZE 32
      8 
      9 __global__ void
     10 spmv_csr_vector_kernel ( const int num_rows ,
     11                          const int * ptr ,
     12                          const int * indices ,
     13                          const double * data ,
     14                          const double * x ,
     15                          double * y)
     16 {
     17     __shared__ double vals [WARP_SIZE];
     18     int thread_id = blockDim.x * blockIdx.x + threadIdx.x ; // global thread index
     19     int warp_id = thread_id / WARP_SIZE; // global warp index
     20     int lane = thread_id & (WARP_SIZE - 1); // thread index within the warp
     21     // one warp per row
     22     int row = warp_id ;
     23     if ( row < num_rows )
     24     {
     25         int row_start = ptr [ row ];
     26         int row_end = ptr [ row +1];
     27         // compute running sum per thread
     28         vals [ threadIdx.x ] = 0;
     29         for ( int jj = row_start + lane ; jj < row_end ; jj += WARP_SIZE)
     30         vals [ threadIdx.x ] += data [ jj ] * x [ indices [ jj ]];
     31         // parallel reduction in shared memory
     32         if ( lane < 16) vals [ threadIdx.x ] += vals [ threadIdx.x + 16];
     33         if ( lane < 8) vals [ threadIdx.x ] += vals [ threadIdx.x + 8];
     34         if ( lane < 4) vals [ threadIdx.x ] += vals [ threadIdx.x + 4];
     35         if ( lane < 2) vals [ threadIdx.x ] += vals [ threadIdx.x + 2];
     36         if ( lane < 1) vals [ threadIdx.x ] += vals [ threadIdx.x + 1];
     37         // first thread writes the result
     38         if ( lane == 0)
     39         y[ row ] += vals [ threadIdx.x ];
     40     }
     41 }
     42 
     43 __global__ void
     44 spmv_csr_scalar_kernel ( const int num_rows ,
     45                          const int * ptr ,
     46                          const int * indices ,
     47                          const double * data ,
     48                          const double * x ,
     49                          double * y)
     50 {
     51     int row = blockDim.x * blockIdx.x + threadIdx.x ;
     52     if( row < num_rows )
     53     {
     54         double dot = 0;
     55         int row_start = ptr [ row ];
     56         int row_end = ptr [ row +1];
     57         for (int jj = row_start ; jj < row_end ; jj ++)
     58             dot += data [ jj ] * x[ indices [ jj ]];
     59         y[ row ] += dot ;
     60     }
     61 }
     62 
     63 int main(int argc,char **argv)
     64 {
     65     double h_data[32]={1,4,2,3,5,7,8,9,6};
     66     int h_col[32]={0,1,1,2,0,3,4,2,4};
     67     int h_ptr[32]={0,2,4,7,9};
     68     double h_x[32]={1,2,3,4,5};
     69     double h_y[32]={0,0,0,0};
     70     int num_rows=4;
     71 
     72     double *d_data;
     73     int *d_col;
     74     int *d_ptr;
     75     double *d_x;
     76     double *d_y;
     77 
     78     cudaMalloc((void**) &d_data,sizeof(double)*32);
     79     cudaMalloc((void**) &d_col,sizeof(int)*32);
     80     cudaMalloc((void**) &d_ptr,sizeof(int)*32);
     81     cudaMalloc((void**) &d_x,sizeof(double)*32);
     82     cudaMalloc((void**) &d_y,sizeof(double)*32);
     83     cudaMemcpy((void*)d_data, (void*)h_data, sizeof(double)*32, cudaMemcpyHostToDevice);
     84     cudaMemcpy((void*)d_col, (void*)h_col, sizeof(int)*32, cudaMemcpyHostToDevice);
     85     cudaMemcpy((void*)d_ptr, (void*)h_ptr, sizeof(int)*32, cudaMemcpyHostToDevice);
     86     cudaMemcpy((void*)d_x, (void*)h_x, sizeof(double)*32, cudaMemcpyHostToDevice);
     87     cudaMemcpy((void*)d_y, (void*)h_y, sizeof(double)*32, cudaMemcpyHostToDevice);
     88 
     89     GpuTimer timer;
     90     timer.Start();
     91     spmv_csr_vector_kernel<<<num_rows,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
     92     //spmv_csr_scalar_kernel<<<1,32>>>(num_rows,d_ptr,d_col,d_data,d_x,d_y);
     93     timer.Stop();
     94     printf("Duration: %g ms
    ",timer.Elapsed());
     95 
     96     cudaMemcpy((void*)h_y, (void*)d_y, sizeof(double)*32, cudaMemcpyDeviceToHost);
     97 
     98     for(int i=0;i<num_rows;i++)
     99         printf("%.5f ",h_y[i]);
    100     printf("
    ");
    101 
    102     return 0;
    103 }
    View Code

    ref:

    http://www.nvidia.com/docs/IO/66889/nvr-2008-004.pdf  

    ch4.3

  • Related reading:
    Spring Boot发布2.6.2、2.5.8:升级log4j2到2.17.0
    如何优雅地读写HttpServletRequest和HttpServletResponse的请求体
    更快的Maven来了
    Spring Cloud Gateway过滤器精确控制异常返回(实战,控制http返回码和message字段)
    NumPy学习笔记
    来自Java程序员的Python新手入门小结
    Java应用日志如何与Jaeger的trace关联
    Jaeger知识点补充
    分布式调用链跟踪工具Jaeger?两分钟极速体验
    Spring Cloud Gateway过滤器精确控制异常返回(实战,完全定制返回body)
  • 原文地址:https://www.cnblogs.com/pdev/p/6346761.html
Copyright © 2011-2022 走看看