zoukankan      html  css  js  c++  java
  • cuda编程学习6——点积dot

    __shared__ float cache[threadPerBlock];//使用 __shared__ 关键字声明共享内存缓冲区,同一线程块内的所有线程共用这块缓冲区

    __syncthreads();//对线程块中的所有线程进行同步:只有当块内每个线程都执行完前面的语句后,才能继续执行后面的语句

    代码:

    /*
    ============================================================================
    Name : dot.cu
    Author : can
    Version :
    Copyright : Your copyright notice
    Description : CUDA dot product of two vectors using a shared-memory reduction
    ============================================================================
    */

    #include <iostream>
    using namespace std;

    // Forward declaration of the error-reporting helper used by CUDA_CHECK_RETURN.
    // Arguments, in order: source file name, source line, text of the checked
    // expression, and the cudaError_t that expression produced.
    static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
    // Evaluates `value` (a CUDA runtime call returning cudaError_t) exactly once and
    // forwards the result to CheckCudaErrorAux together with the call site
    // (__FILE__, __LINE__) and the stringified expression (#value).
    #define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)

    // Minimum of two values. Every use of an argument is parenthesized so that
    // expressions containing low-precedence operators (e.g. `x & 1`, `p ? q : r`)
    // expand correctly. Note: each argument may still be evaluated twice, so do
    // not pass expressions with side effects.
    #define imin(a,b) ((a)<(b)?(a):(b))
    // Number of elements in each input vector.
    const int N=33*1024;
    // Threads per block; also the size of the kernel's shared-memory cache.
    const int threadPerBlock=256;
    // Blocks per grid: enough blocks to cover N elements (ceil-div), capped at 32.
    const int blockPerGrid=imin(32,(N+threadPerBlock-1)/threadPerBlock);

    /*
     * Computes per-block partial sums of the dot product of a and b.
     *
     * a, b : input vectors holding at least N floats each.
     * c    : output array; block k writes its partial sum to c[k], so it must
     *        hold at least gridDim.x floats.
     *
     * The shared cache has threadPerBlock entries and is indexed by threadIdx.x,
     * so blockDim.x must not exceed threadPerBlock. The halving reduction below
     * only covers every entry when blockDim.x is a power of two.
     */
    __global__ void dot(float *a,float *b,float *c)
    {
        // One slot per thread of the block, visible to all threads of that block.
        __shared__ float cache[threadPerBlock];
        const int slot = threadIdx.x;

        // Each thread accumulates products for the elements it owns, advancing
        // by the total number of threads in the grid until it runs past N.
        float partial = 0;
        for(int idx = threadIdx.x + blockIdx.x*blockDim.x; idx < N; idx += blockDim.x*gridDim.x)
        {
            partial += a[idx] * b[idx];
        }
        cache[slot] = partial;

        // Every thread's partial must be stored before any thread reads a neighbour's.
        __syncthreads();

        // Tree reduction: on each pass the lower half of the active slots adds
        // in the upper half, then the active width is halved.
        for(int half = blockDim.x/2; half != 0; half /= 2)
        {
            if(slot < half)
            {
                cache[slot] += cache[slot + half];
            }
            // Barrier sits outside the branch so that all threads reach it.
            __syncthreads();
        }

        // Slot 0 now holds the block's total; a single thread publishes it.
        if(slot == 0)
        {
            c[blockIdx.x] = cache[0];
        }
    }

    /*
     * Host driver for the dot-product kernel.
     *
     * Fills a[i] = i and b[i] = 2*i for i in [0, N), copies both vectors to the
     * device, launches `dot` with blockPerGrid blocks of threadPerBlock threads,
     * copies back one partial sum per block, adds them on the host and prints the
     * result next to the closed-form value 2 * sum_{i=0}^{N-1} i^2.
     *
     * Returns 0 on success and 1 if a host allocation fails. Any failing CUDA
     * call terminates the process through CUDA_CHECK_RETURN.
     */
    int main()
    {
        float *a,*b,c,*partial_c;
        float *dev_a,*dev_b,*dev_partial_c;

        // Host buffers.
        a = (float *)malloc(N*sizeof(float));
        b = (float *)malloc(N*sizeof(float));
        partial_c = (float *)malloc(blockPerGrid*sizeof(float));
        if(a == NULL || b == NULL || partial_c == NULL)
        {
            cerr<<"host memory allocation failed"<<endl;
            // free(NULL) is a no-op, so releasing all three is safe here.
            free(a);
            free(b);
            free(partial_c);
            return 1;
        }

        // Device buffers. The kernel writes exactly one float per block, so the
        // partial-sum buffer only needs blockPerGrid entries.
        CUDA_CHECK_RETURN(cudaMalloc((void **)&dev_a,N*sizeof(float)));
        CUDA_CHECK_RETURN(cudaMalloc((void **)&dev_b,N*sizeof(float)));
        CUDA_CHECK_RETURN(cudaMalloc((void **)&dev_partial_c,blockPerGrid*sizeof(float)));

        for(int i=0;i<N;i++)
        {
            a[i] = i;
            b[i] = i*2;
        }

        CUDA_CHECK_RETURN(cudaMemcpy(dev_a,a,N*sizeof(float),cudaMemcpyHostToDevice));
        CUDA_CHECK_RETURN(cudaMemcpy(dev_b,b,N*sizeof(float),cudaMemcpyHostToDevice));

        dot<<<blockPerGrid,threadPerBlock>>>(dev_a,dev_b,dev_partial_c);
        // A kernel launch returns nothing; launch-configuration errors are only
        // visible through cudaGetLastError().
        CUDA_CHECK_RETURN(cudaGetLastError());

        // Blocking copy: it also waits for the kernel, so execution errors
        // surface through this check.
        CUDA_CHECK_RETURN(cudaMemcpy(partial_c,dev_partial_c,blockPerGrid*sizeof(float),cudaMemcpyDeviceToHost));

        // Finish the reduction on the host: one partial sum per block.
        c=0;
        for(int i=0;i<blockPerGrid;i++)
        {
            c += partial_c[i];
        }

        // Closed form for 0^2 + 1^2 + ... + x^2; arguments are parenthesized so
        // the macro is safe for arbitrary expressions.
        #define sum_squares(x) ((x)*((x)+1)*(2*(x)+1)/6)
        cout<<"Does GPU value "<<c<<" = "<<2*sum_squares((float)(N-1))<<endl;

        CUDA_CHECK_RETURN(cudaFree(dev_a));
        CUDA_CHECK_RETURN(cudaFree(dev_b));
        CUDA_CHECK_RETURN(cudaFree(dev_partial_c));
        free(a);
        free(b);
        free(partial_c);
        return 0;
    }

    /*
     * Reports a failed CUDA runtime call and terminates the process.
     *
     * file, line : call site of the checked expression.
     * statement  : source text of the checked expression.
     * err        : status returned by that expression.
     *
     * Does nothing when err is cudaSuccess. Otherwise writes the statement, the
     * error string, the numeric code and the location to std::cerr, then calls
     * exit(1).
     */
    static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
    {
        if (err != cudaSuccess)
        {
            std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
            exit (1);
        }
    }

  • 相关阅读:
    分析一个文本文件(英文文章)中各个词出现的频率,并且把频率最高的10个词打印出来
    求一个数组中的最大整数
    一个统计文本文件中各个英文单词出现频率的问题,并且输出频率最高的10个词
    Python学习一:基础语法
    Spring学习之二
    Spring学习之装配Bean
    Spring学习一
    缓存之ehcache
    解决axios传递参数后台无法接收问题
    服务端解决跨域问题
  • 原文地址:https://www.cnblogs.com/shrimp-can/p/5046664.html
Copyright © 2011-2022 走看看