zoukankan      html  css  js  c++  java
  • CUDA 实例练习(二)

    题目:多项式f(x)=x + x^2 + x^3 + ···+x^n求和。

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <math.h>
    #include <stdio.h>
    #include <gputimer.h>
    
    __global__ void polynomial_items(float * array, float x, int n)
    {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        //float sum = 0.0f;
    
        if (idx < n) {
            array[idx] = pow(x, idx + 1);
        }
        __syncthreads();
    
        //for (int i = 0; i <= idx; i++){
        //sum += array[idx];
        //}
    }
    
    __global__ void shmem_reduce_kernel(float * d_out, const float * d_in)
    {
        extern __shared__ float sdata[];
    
        int myId = threadIdx.x + blockIdx.x * blockDim.x;
        int tid = threadIdx.x;
        // load shared mem from global mem
        sdata[tid] = d_in[myId];
        __syncthreads();
        // do reduction in shared mem
        for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (tid < s)
            {
                sdata[tid] += sdata[tid + s];
            }
            __syncthreads();// make sure all adds at one stage are done!
        }
        
        if (tid == 0)// only thread 0 writes result for this block back to global mem
        {
            d_out[blockIdx.x] = sdata[0];
        }
    }
    
    void reduce(float * d_out, float * d_intermediate, float * d_in,int size)
    {
        // assumes that size is not greater than maxThreadsPerBlock^2 
        // and that size is a multiple of maxThreadsPerBlock 
        const int maxThreadsPerBlock = 1024;
        int threads = maxThreadsPerBlock;
        int blocks = size / maxThreadsPerBlock + 1;
        shmem_reduce_kernel << <blocks, threads, threads * sizeof(float) >> >(
            d_intermediate, d_in);
    
        // now we're down to one block left, so reduce it
        threads = blocks;// launch one thread for each block in prev step
        blocks = 1;
        shmem_reduce_kernel << <blocks, threads, threads*sizeof(float) >> >(
            d_out, d_intermediate);
    }
    int main()
    {
        GpuTimer timer;
        int n;
        float x;
        float sum = 0.0f;
        printf("n is ");
        scanf("%d", &n);
        printf("x is ");
        scanf("%f", &x);
        float * h_in;
        h_in = (float *)malloc(sizeof(float)*n);
    
        float * d_array;
        const int array_bytes = n* sizeof(float);
        cudaMalloc((void **)&d_array, array_bytes);
        float *d_in, *d_intermediate, *d_out;
        cudaMalloc((void **)&d_in, array_bytes);
        cudaMalloc((void **)&d_intermediate, array_bytes);
        cudaMalloc((void **)&d_out, sizeof(float));
        float h_out;
    
        timer.Start();
        polynomial_items << <n / 1024 + 1, 1024 >> >(d_array, x, n);
        cudaMemcpy(h_in, d_array, array_bytes, cudaMemcpyDeviceToHost);
        cudaMemcpy(d_in, h_in, array_bytes, cudaMemcpyHostToDevice);
        reduce(d_out, d_intermediate, d_in, n);
        //for (int i = 0; i <n; i++){
            //sum += h_in[i];
        //}
        timer.Stop();
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < n; i++){
            printf("%f ", h_in[i]);
        }
        printf("
    Time elapsed = %g ms
    ", timer.Elapsed());
    
        //printf("sum is %lf
    ", sum);
        printf("h_out is %lf
    ", h_out);
    
        cudaFree(d_array);
        cudaFree(d_in);
        cudaFree(d_intermediate);
        cudaFree(d_out);
    
        return 0;
    
    }
  • 相关阅读:
    sql server 中各个系统表的作用==== (转载)
    后台动态设置前台标签内容和属性
    利用C#编写一个简单的抓网页应用程序
    如何创建和使用Web Service代理类
    jdbc如何取得存储过程return返回值
    子窗口和父窗口的函数或对象能否相互访问 (转载)
    把aspx文件编译成DLL文件
    C#中的类型转换
    c#中对文件的操作小结
    转贴一篇 自定义数据库 希望对你有帮助
  • 原文地址:https://www.cnblogs.com/zhangshuwen/p/7153011.html
Copyright © 2011-2022 走看看