zoukankan      html  css  js  c++  java
  • CUDA学习:第一个CUDA代码:数组求和

    今天有些收获了,成功运行了数组求和代码:就是将N个数相加求和

    //环境:CUDA5.0,vs2010

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"

    #include <stdio.h>

    // Host-side helper defined later in this file: copies the TOTALN ints in `a`
    // to the GPU, launches SumArray, and copies BLOCKS_PerGrid partial sums
    // back into `c`.
    cudaError_t addWithCuda(int *c, int *a);


    // Number of input elements to sum.
    #define TOTALN 72120
    // Number of blocks in the launch grid; also the length of the partial-sum array.
    #define BLOCKS_PerGrid 32
    // Threads per block and the size of the kernel's shared-memory cache. The
    // kernel's halving reduction starts at THREADS_PerBlock/2, so this value
    // needs to be a power of two for every cache slot to be folded in.
    #define THREADS_PerBlock 64 // 2^6

    // Block-level sum reduction.
    //
    // Each thread accumulates a private partial sum over the input with a
    // grid-stride loop, stores it in shared memory, and the block then folds the
    // shared array in half repeatedly until slot 0 holds the block total.
    //
    //   c : output, one entry per block (indexed by blockIdx.x)
    //   a : input, TOTALN ints
    //
    // Launch layout: 1-D grid, 1-D blocks. The shared cache has THREADS_PerBlock
    // slots and the reduction starts from THREADS_PerBlock/2, so the kernel is
    // written for blockDim.x == THREADS_PerBlock.
    __global__ void SumArray(int *c, int *a)
    {
        // One slot per thread of the block.
        __shared__ unsigned int partials[THREADS_PerBlock];

        const int lane = threadIdx.x;
        // Total number of threads in the grid.
        const int stride = gridDim.x * blockDim.x;

        // Phase 1: per-thread accumulation over elements idx, idx+stride, ...
        unsigned int acc = 0;
        for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < TOTALN; idx = idx + stride)
        {
            acc += a[idx];
        }
        partials[lane] = acc;

        // Every thread's partial must be visible before the tree reduction reads it.
        __syncthreads();

        // Phase 2: pairwise halving. Threads in the lower half add the matching
        // slot from the upper half; the barrier sits outside the `if` so that all
        // threads of the block reach it on every round.
        for (unsigned int half = THREADS_PerBlock >> 1; half; half = half >> 1)
        {
            if (lane < half)
            {
                partials[lane] += partials[lane + half];
            }
            __syncthreads();
        }

        // Thread 0 publishes this block's total.
        if (lane == 0)
        {
            c[blockIdx.x] = partials[0];
        }
    }

    // Entry point: fills the input with ones, sums it on the GPU via addWithCuda,
    // finishes the reduction on the host by adding the per-block partial sums,
    // and prints that next to a plain CPU sum of the same input for comparison.
    //
    // Returns 0 on success, 1 if addWithCuda or cudaDeviceReset reports an error.
    int main()
    {
        // TOTALN ints is roughly 280 KB; static storage keeps that off the stack.
        static int a[TOTALN];
        int c[BLOCKS_PerGrid];

        unsigned int j;

        // Initialise the input. Every element is 1, so the expected sum is TOTALN.
        for (j = 0; j < TOTALN; j++)
        {
            a[j] = 1;
        }

        // Parallel sum: c receives one partial sum per block.
        cudaError_t cudaStatus = addWithCuda(c, a);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "addWithCuda failed!");
            return 1;
        }

        // Combine the per-block partial sums on the host.
        unsigned int sum1 = 0;
        for (j = 0; j < BLOCKS_PerGrid; j++)
        {
            sum1 += c[j];
        }

        // Reference result computed entirely on the CPU.
        unsigned int sum2 = 0;
        for (j = 0; j < TOTALN; j++)
        {
            sum2 += a[j];
        }

        // Both values are unsigned int, so they are printed with %u.
        printf("sum1=%u; sum2=%u ", sum1, sum2);

        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            return 1;
        }

        return 0;
    }

    // Helper function for using CUDA to add vectors in parallel.

    // Host wrapper that runs the SumArray reduction on GPU 0.
    //
    //   c : host output buffer with room for BLOCKS_PerGrid ints; receives one
    //       partial sum per block
    //   a : host input buffer holding TOTALN ints
    //
    // Returns cudaSuccess, or the first CUDA error encountered. Device buffers
    // are released on every path; on failure the contents of `c` are not
    // guaranteed to have been written.
    cudaError_t addWithCuda(int *c, int *a)
    {
        // Start as null so the cleanup path can free them unconditionally.
        int *dev_a = 0;
        int *dev_c = 0;
        cudaError_t cudaStatus;

        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
            goto Error;
        }

        // Device buffer for the per-block partial sums (same length as c).
        cudaStatus = cudaMalloc((void**)&dev_c, BLOCKS_PerGrid * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }

        // Device buffer for the input (same length as a).
        cudaStatus = cudaMalloc((void**)&dev_a, TOTALN * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }

        // Copy the input from host memory to the GPU buffer.
        cudaStatus = cudaMemcpy(dev_a, a, TOTALN * sizeof(int), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        // Launch BLOCKS_PerGrid blocks of THREADS_PerBlock threads; each block
        // writes its partial sum to dev_c[blockIdx.x].
        SumArray<<<BLOCKS_PerGrid, THREADS_PerBlock>>>(dev_c, dev_a);

        // A kernel launch returns no status itself; launch-time errors (e.g. an
        // invalid configuration) have to be fetched explicitly.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "SumArray launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // Wait for the kernel to finish; errors raised while it was executing
        // are reported here.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching SumArray! ", cudaStatus);
            goto Error;
        }

        // Copy the per-block partial sums from the GPU back to host memory.
        cudaStatus = cudaMemcpy(c, dev_c, BLOCKS_PerGrid * sizeof(int), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

    Error:
        // Reached on success as well as on failure.
        cudaFree(dev_c);
        cudaFree(dev_a);

        return cudaStatus;
    }

    www.shuleikeji.com
  • 相关阅读:
    【已解决】github中git push origin master出错:error: failed to push some refs to
    好记心不如烂笔头,ssh登录 The authenticity of host 192.168.0.xxx can't be established. 的问题
    THINKPHP 5.0目录结构
    thinkphp5.0入口文件
    thinkphp5.0 生命周期
    thinkphp5.0 架构
    Django template
    Django queryset
    Django model
    Python unittest
  • 原文地址:https://www.cnblogs.com/dongchunxiao/p/4854751.html
Copyright © 2011-2022 走看看