zoukankan      html  css  js  c++  java
  • CUDA实例练习(四):矩阵转置

     1 #include <stdio.h>
     2 #include "gputimer.h"
     3 #include "cuda_runtime.h"
     4 #include "device_launch_parameters.h"
     5 #include <stdlib.h>
     6 
     7 const int N = 1024;  // matrix edge length: buffers hold N*N floats, indexed as mat[i + j*N]
     8 const int K = 32;    // thread-block edge: main launches K x K threads on an (N/K) x (N/K) grid
     9 
    /*
     * Initialise a host matrix of N*N floats so that the element at flat
     * index k holds the value k.
     *
     * mat: caller-owned buffer with room for at least N*N floats.
     */
    void fill_matrix(float * mat){
        const int total = N * N;
        int idx = 0;
        while (idx < total) {
            mat[idx] = (float)idx;
            ++idx;
        }
    }
    14 
    /*
     * Print an N x N matrix to stdout, one text line per group of N
     * consecutive elements. Each value is formatted with "%4.4g" and
     * followed by a single space.
     *
     * mat: buffer of at least N*N floats, read as mat[col + row*N].
     */
    void print_matrix(float *mat)
    {
        for (int row = 0; row < N; row++)
        {
            for (int col = 0; col < N; col++)
                printf("%4.4g ", mat[col + row*N]);
            /* The line terminator must be the escape sequence "\n"; a raw
               line break inside the quotes is not a valid string literal. */
            printf("\n");
        }
    }
    23 
    /*
     * Reference transpose: a single thread walks the whole N x N matrix.
     * For every pair (r, c) it copies in[c + r*N] to out[r + c*N].
     *
     * Every thread that is launched runs the complete double loop, so the
     * kernel is meant to be started with one thread (main uses <<<1, 1>>>).
     *
     * in:  device buffer of N*N floats (source).
     * out: device buffer of N*N floats (destination).
     */
    __global__ void transpose_serial(float in[], float out[]){
        for (int r = 0; r < N; r++) {
            for (int c = 0; c < N; c++) {
                const int src = c + r*N;
                const int dst = r + c*N;
                out[dst] = in[src];
            }
        }
    }
    29 
    /*
     * Transpose with one thread per index i: thread i copies in[i + j*N]
     * to out[j + i*N] for every j in [0, N).
     *
     * Launch layout: 1-D grid of 1-D blocks with at least N threads in
     * total. main launches it as <<<1, N>>>; because the index is built from
     * blockIdx and blockDim as well as threadIdx, the same kernel also works
     * when the N threads are spread over several blocks.
     *
     * in:  device buffer of N*N floats (source).
     * out: device buffer of N*N floats (destination).
     */
    __global__ void transpose_parallel_per_row(float in[], float out[]){
        int i = blockIdx.x * blockDim.x + threadIdx.x;

        /* Surplus threads (grid larger than N) must not touch memory. */
        if (i >= N)
            return;

        for (int j = 0; j < N; j++)
            out[j + i*N] = in[i + j*N];
    }
    36 
    /*
     * Transpose with one thread per matrix element: the thread with global
     * coordinates (i, j) copies in[i + j*N] to out[j + i*N].
     *
     * Launch layout: 2-D grid of 2-D blocks covering at least N x N threads.
     * main uses (N/K) x (N/K) blocks of K x K threads. The coordinates are
     * derived from blockDim rather than the constant K, so other block
     * shapes index correctly too.
     *
     * in:  device buffer of N*N floats (source).
     * out: device buffer of N*N floats (destination).
     */
    __global__ void transpose_parallel_per_element(float in[], float out[]){
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int j = blockIdx.y * blockDim.y + threadIdx.y;

        /* Guard the grid tail so an over-sized launch cannot write out of bounds. */
        if (i < N && j < N)
            out[j + i*N] = in[i + j*N];
    }
    /* Abort with a diagnostic when a CUDA runtime call reports an error. */
    static void check_cuda(cudaError_t status, const char *what){
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    }

    /*
     * Driver: fills an N x N host matrix, copies it to the device, then runs
     * and times the three transpose kernels in turn, copying the result back
     * to the host after each run and printing the elapsed time.
     *
     * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or
     * any CUDA call fails.
     */
    int main(void){
        /* size_t is the type the allocation and copy functions take. */
        size_t numbytes = (size_t)N * N * sizeof(float);

        float *in = (float *)malloc(numbytes);
        float *out = (float *)malloc(numbytes);
        if (in == NULL || out == NULL) {
            fprintf(stderr, "host allocation of %lu bytes failed\n",
                    (unsigned long)numbytes);
            free(in);
            free(out);
            return EXIT_FAILURE;
        }
        fill_matrix(in);

        float *d_in, *d_out;

        check_cuda(cudaMalloc((void **)&d_in, numbytes), "cudaMalloc(d_in)");
        check_cuda(cudaMalloc((void **)&d_out, numbytes), "cudaMalloc(d_out)");
        check_cuda(cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(host to device)");

        /* GpuTimer is declared in gputimer.h, which is not visible here;
           it is used exactly as before (Start / Stop / Elapsed). */
        GpuTimer timer;

        /* 1) One thread transposes the whole matrix. */
        timer.Start();
        transpose_serial << <1, 1 >> >(d_in, d_out);
        timer.Stop();
        /* A launch does not return a status; fetch it explicitly. It is
           queried after Stop() so the timed region is unchanged. */
        check_cuda(cudaGetLastError(), "transpose_serial launch");
        check_cuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(transpose_serial result)");
        printf("transpose_serial:%g ms.\n", timer.Elapsed());

        /* 2) One block of N threads; each thread handles one index i. */
        timer.Start();
        transpose_parallel_per_row << <1, N >> >(d_in, d_out);
        timer.Stop();
        check_cuda(cudaGetLastError(), "transpose_parallel_per_row launch");
        check_cuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(transpose_parallel_per_row result)");
        printf("transpose_parallel_per_row:%g ms.\n", timer.Elapsed());

        /* 3) One thread per element: (N/K) x (N/K) blocks of K x K threads. */
        dim3 blocks(N / K, N / K);
        dim3 threads(K, K);
        timer.Start();
        transpose_parallel_per_element << <blocks, threads >> >(d_in, d_out);
        timer.Stop();
        check_cuda(cudaGetLastError(), "transpose_parallel_per_element launch");
        check_cuda(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                   "cudaMemcpy(transpose_parallel_per_element result)");
        printf("transpose_parallel_per_element: %g ms.\n", timer.Elapsed());

        check_cuda(cudaFree(d_in), "cudaFree(d_in)");
        check_cuda(cudaFree(d_out), "cudaFree(d_out)");

        /* Release the host buffers as well as the device ones. */
        free(in);
        free(out);

        return 0;
    }
  • 相关阅读:
    JavaSE 基础 第51节 定义自己的异常
    JavaSE 基础 第50节 Java中的异常链
    JavaSE 基础 第49节 手动抛出异常
    JavaSE 基础 第48节 Java中的异常声明
    JavaSE 基础 第47节 获取异常信息
    JavaSE 基础 第46节 异常的分类
    JavaSE 基础 第45节Java异常快速入门
    JavaSE 基础 第44节 引用外部类的对象
    JavaSE 基础 第43节 静态内部类
    通用爬虫
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7263567.html
Copyright © 2011-2022 走看看