zoukankan      html  css  js  c++  java
  • CUDA 实例练习(二)

    题目:多项式f(x)=x + x^2 + x^3 + ···+x^n求和。

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <math.h>
    #include <stdio.h>
    #include <gputimer.h>
    
    __global__ void polynomial_items(float * array, float x, int n)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        //float sum = 0.0f;
    
        if (idx < n) {
            array[idx] = pow(x, idx + 1);
        }
        __syncthreads();
    
        //for (int i = 0; i <= idx; i++){
        //sum += array[idx];
        //}
    }
    
    __global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
    {
        extern __shared__ float sdata[];
    
        int myId = threadIdx.x + blockIdx.x * blockDim.x;
        int tid = threadIdx.x;
        // load shared mem from global mem
        sdata[tid] = d_in[myId];
        __syncthreads();
        // do reduction in shared mem
        for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();// make sure all adds at one stage are done!
        }
        
        if (tid == 0)// only thread 0 writes result for this block back to global mem
        {
            d_out[blockIdx.x] = sdata[0];
        }
    }
    
    void reduce(float * d_out, float * d_intermediate, float * d_in,int size)
    {
        // assumes that size is not greater than maxThreadsPerBlock^2 
        // and that size is a multiple of maxThreadsPerBlock 
        const int maxThreadsPerBlock = 1024;
        int threads = maxThreadsPerBlock;
        int blocks = size / maxThreadsPerBlock + 1;
        shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >(
            d_intermediate, d_in);
    
        // now we're down to one block left, so reduce it
        threads = blocks;// launch one thread for each block in prev step
        blocks = 1;
        shmem_reduce_kernel << <blocks, threads, threads*sizeof(float) >> >(
            d_out, d_intermediate);
    }
    int main()
    {
        GpuTimer timer;
        int n;
        float x;
        float sum = 0.0f;
        printf("n is ");
        scanf("%d", &n);
        printf("x is ");
        scanf("%f", &x);
        float * h_in;
        h_in = (float *)malloc(sizeof(float)*n);
    
        float * d_array;
        const int array_bytes = n* sizeof(float);
        cudaMalloc((void **)&d_array, array_bytes);
        float *d_in, *d_intermediate, *d_out;
        cudaMalloc((void **)&d_in, array_bytes);
        cudaMalloc((void **)&d_intermediate, array_bytes);
        cudaMalloc((void **)&d_out, sizeof(float));
        float h_out;
    
        timer.Start();
        polynomial_items << <n / 1024 + 1, 1024 >> >(d_array, x, n);
        cudaMemcpy(h_in, d_array, array_bytes, cudaMemcpyDeviceToHost);
        cudaMemcpy(d_in, h_in, array_bytes, cudaMemcpyHostToDevice);
        reduce(d_out, d_intermediate, d_in, n);
        //for (int i = 0; i <n; i++){
            //sum += h_in[i];
        //}
        timer.Stop();
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < n; i++){
            printf("%f ", h_in[i]);
        }
        printf("
    Time elapsed = %g ms
    ", timer.Elapsed());
    
        //printf("sum is %lf
    ", sum);
        printf("h_out is %lf
    ", h_out);
    
        cudaFree(d_array);
        cudaFree(d_in);
        cudaFree(d_intermediate);
        cudaFree(d_out);
    
        return 0;
    
    }
  • 相关阅读:
    add to svn ignore disabled
    Zend 安装 OpenExplorer插件
    Win7系统怎么清理注册表残留?【系统天地】
    永久免费Win10企业版激活密钥大全【系统天地】
    Win7取消自动锁屏的方法步骤【系统天地】
    手把手教你怎么激活windows10专业版【系统天地】
    win8解除网速限制的操作方法【系统天地】
    教你win10系统显卡驱动安装失败的解决方法【系统天地】
    Win7移动硬盘在电脑上不显示怎么办【系统天地】
    Win10运行程序提示不受信任的解决方法【系统天地】
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7153011.html
Copyright © 2011-2022 走看看