  • CUDA -- Reduction for Computing Row Sums of a Matrix

      How do we compute the sum of each row of a matrix?

      Each row can be assigned to its own thread block, so the rows are processed with coarse-grained parallelism. Within a row, the corresponding block launches n threads (one per column, matching the row width); each thread reads one element from global memory into shared memory, and the block then computes the row sum with a tree reduction.
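      Before looking at the full CUDA program, the halving pattern itself is easy to trace on the CPU. The sketch below is not from the original post; it just mirrors the kernel's reduction loop serially on the first row of the example matrix, with strides 4, 2, 1:

    #include <stdio.h>
    
    int main()
    {
        int data[8] = { 1, 2, 3, 4, 5, 1, 2, 3 }; // first row of the matrix used below
        // Tree reduction: at stride i, slot tid accumulates slot tid + i.
        // Strides 4 -> 2 -> 1 leave the row sum in data[0] after log2(8) = 3 passes.
        for (int i = 4; i > 0; i /= 2)
            for (int tid = 0; tid < i; ++tid)
                data[tid] += data[tid + i];
        printf("row sum = %d\n", data[0]); // prints 21
        return 0;
    }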

    The code is as follows:

    #include "cuda_runtime.h" //CUDA运行时API
    #include "device_launch_parameters.h"
    #include <iostream>
    #include <stdio.h>
    
    cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d);
    
    __global__ void addKernel(int *mat, int *ans, size_t pitch)
    {
        int bid = blockIdx.x;                        // one block per row
        int tid = threadIdx.x;                       // one thread per column
        __shared__ int data[8];                      // one row staged in shared memory
        int *row = (int*)((char*)mat + bid*pitch);   // pitch is in bytes, hence the char* cast
        data[tid] = row[tid];
        __syncthreads();
        for (int i = 4; i > 0; i /= 2) {             // tree reduction: strides 4, 2, 1
            if (tid < i)
                data[tid] = data[tid] + data[tid + i];
            __syncthreads();
        }
        if (tid == 0)
            ans[bid] = data[0];
    }
    
    int main()
    {
        const int row = 4;
        const int col = 8;
        dim3 d(col, row);
        int mat[row][col] = { {  1,  2,  3,  4,  5,  1,  2,  3 },
                              {  6,  7,  8,  9, 10,  4,  5,  6 },
                              { 11, 12, 13, 14, 15,  7,  8,  9 },
                              { 16, 17, 18, 19, 20, 10, 11, 12 } };
        int ans[row];
        // Compute the row sums in parallel.
        cudaError_t cudaStatus = addWithCuda(mat, ans, d);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "addWithCuda failed!
    ");
            return 1;
        }
        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaDeviceReset failed!");
            return 1;
        }
        for (int i = 0; i < d.y; i++)
        {
            std::cout << ans[i] << " ";
        }
        return 0;
    }
    
    
    // The key function to understand
    cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d)
    {
        int *dev_mat = 0;       // device-side matrix pointer
        int *dev_ans = 0;       // device-side result pointer
        size_t pitch;           // row pitch in bytes, set by cudaMallocPitch
        cudaError_t cudaStatus; // status flag
        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0); // select the device to run on
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }
        // Allocate device memory (pitched allocation for the 2-D matrix)
        cudaStatus = cudaMallocPitch((void**)&dev_mat, &pitch, d.x * sizeof(int), d.y);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed!
    ");
            goto Error;
        }
        cudaStatus = cudaMalloc((void**)&dev_ans, d.y * sizeof(int));
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed!
    ");
            goto Error;
        }
        // Copy the input matrix to the GPU
        cudaStatus = cudaMemcpy2D(dev_mat, pitch, mat, d.x*sizeof(int), d.x*sizeof(int), d.y, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy for dev_mat failed!
    ");
            goto Error;
        }
        // (Optional: the kernel overwrites every element of dev_ans, so this copy could be skipped.)
        cudaStatus = cudaMemcpy(dev_ans, ans, d.y * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy for dev_ans failed!
    ");
            goto Error;
        }
        // Launch the kernel and time it with CUDA events
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, 0);
        addKernel<<<d.y,d.x>>>(dev_mat, dev_ans, pitch);
    
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        float tm;
        cudaEventElapsedTime(&tm, start, stop);
        printf("GPU Elapsed time:%.6f ms.
    ", tm);
        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.
        cudaStatus = cudaDeviceSynchronize(); // wait for the kernel to finish
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!
    ", cudaStatus);
            goto Error;
        }
        // Copy the row sums from GPU memory back to the host.
        cudaStatus = cudaMemcpy(ans, dev_ans, d.y * sizeof(int), cudaMemcpyDeviceToHost); // copy the results back to the host
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
    Error:
        cudaFree(dev_mat); // free device memory
        cudaFree(dev_ans);
        return cudaStatus;
    }
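
      The kernel above hard-codes the row width through __shared__ int data[8]. A possible generalization (a sketch under my own assumptions, not part of the original post) sizes the shared buffer at launch time with dynamically allocated shared memory, so any row width works as long as the block size is padded up to a power of two:

    // Hypothetical generalized kernel: 'width' is the real number of columns,
    // blockDim.x is the next power of two >= width, and the shared buffer is
    // sized via the third kernel-launch parameter.
    __global__ void rowSumKernel(const int *mat, int *ans, size_t pitch, int width)
    {
        extern __shared__ int data[];               // blockDim.x ints, set at launch
        int bid = blockIdx.x;
        int tid = threadIdx.x;
        const int *row = (const int*)((const char*)mat + bid * pitch);
        data[tid] = (tid < width) ? row[tid] : 0;   // pad the tail with zeros
        __syncthreads();
        for (int i = blockDim.x / 2; i > 0; i /= 2) {
            if (tid < i)
                data[tid] += data[tid + i];
            __syncthreads();
        }
        if (tid == 0)
            ans[bid] = data[0];
    }
    
    // Launch example (threads = next power of two >= number of columns):
    // rowSumKernel<<<rows, threads, threads * sizeof(int)>>>(dev_mat, dev_ans, pitch, cols);

      With the 4x8 matrix in main(), both versions should print the row sums 21 55 89 123.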
  • Original article: https://www.cnblogs.com/chen9510/p/11506257.html