  • GPU Tips

    <1> Basic

    #include <stdio.h>
    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #define NUM 15
    __global__ void square(float *dout,float *din)
    {
        int idx = threadIdx.x;
        float f  = din[idx];
        dout[idx] = f*f;
    }
    
    
    int main(int argc,char **argv)
    {
            
        const int bytes = sizeof(float) * NUM;
        float host_in[NUM];
        // fill the input array with some values
        for(int i=0;i<NUM;i++)
        {
            host_in[i] = float(i);
        }
    
        float host_out[NUM];
    
        cudaError_t cudaStatus;
        // GPU SETTINGS
        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?\n");
            return 1;
        }

        // Allocate GPU memory
        float *device_in =  0;
        float *device_out = 0;
        cudaStatus = cudaMalloc((void**)&device_in, bytes);
        cudaStatus = cudaMalloc((void**)&device_out,bytes);
    
        cudaStatus = cudaMemcpy(device_in,host_in,bytes,cudaMemcpyHostToDevice);
    
    
    
        // Launch the kernel: 1 block of NUM threads
        square<<<1,NUM>>>(device_out,device_in);
    
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching the square kernel!\n", cudaStatus);
        }
    
    
        cudaStatus = cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
        }
    
    
        // Free GPU memory
        cudaFree(device_in);
        cudaFree(device_out);
    
        for(int i=0;i<NUM;i++)
        {
            fprintf(stdout, "%f\n", host_out[i]);
        }
    
        getchar();
    
        return 0;
    
    }
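    Note that the kernel launch itself returns no status. A minimal sketch (using the setup above) of how the launch site could check it: cudaGetLastError reports configuration errors from the most recent launch, while cudaDeviceSynchronize catches errors raised while the kernel runs.

    // Sketch: check the launch itself; cudaGetLastError() reports errors
    // such as an invalid launch configuration.
    square<<<1, NUM>>>(device_out, device_in);
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
    }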

    <2> N blocks with one-dimensional threads per block

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #include <stdio.h>
    #include <stdlib.h>
    #define ARRAYSize 50000000
    #define THREADS_PER_BLOCK 1024
    
    #define fnvalue(a, size)                              \
    {                                                     \
        for (int i = 0; i < size; i++)                    \
        {                                                 \
            a[i] = float(i);                              \
        }                                                 \
    }

    #define CHECK_CUDA_STATUS(STATUS)                     \
    {                                                     \
        if (STATUS != cudaSuccess)                        \
        {                                                 \
            fprintf(stdout, "Error in line %d\n", __LINE__); \
        }                                                 \
    }
    
    
    
    __global__ void add(float *d_out,float *d_x, float *d_y)
    {
        
        int index = blockIdx.x * blockDim.x + threadIdx.x;
        if (index<ARRAYSize)
        {
            d_out[index] = d_x[index] + d_y[index];
        }
        
    }
    
    int main(int argc,char **argv)
    {
    
        const int bytes = sizeof(float)*ARRAYSize;
    
        // host memory
        float *h_x   = (float*)malloc(bytes);
        float *h_y   = (float*)malloc(bytes);
        float *h_out = (float*)malloc(bytes);
    
        // initialize the host arrays
        fnvalue(h_x,ARRAYSize);
        fnvalue(h_y,ARRAYSize);
    
        // device memory
        float *d_x,*d_y,*d_out;
        // CUDA settings
        cudaError_t dstat;
        dstat = cudaSetDevice(0);
        CHECK_CUDA_STATUS(dstat);
        dstat = cudaMalloc((void**)&d_x, bytes);
        CHECK_CUDA_STATUS(dstat);
        dstat = cudaMalloc((void**)&d_y, bytes);
        CHECK_CUDA_STATUS(dstat);
        dstat = cudaMalloc((void**)&d_out, bytes);
        CHECK_CUDA_STATUS(dstat);
    
    
        fprintf(stdout, "Copy data to GPU\n");
        cudaMemcpy(d_x,h_x,bytes,cudaMemcpyHostToDevice);
        cudaMemcpy(d_y,h_y,bytes,cudaMemcpyHostToDevice);
    
    
        // round the block count up so the tail past the last full block is covered
        add<<<(ARRAYSize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_out, d_x, d_y);
    
    
        cudaDeviceSynchronize();

        fprintf(stdout, "Copy GPU data to CPU\n");
        dstat = cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost);
        CHECK_CUDA_STATUS(dstat);
    
        // DEBUG SOME VALUE
        
        for(int i=100600;i<100900;i++)
        {
            if ((i+1)%4==0)
            {
                fprintf(stdout, "%f\n", h_out[i]);
            }
            else
            {
                fprintf(stdout,"%f ", h_out[i]);
            }
        }
    
        getchar();
    
    
    
        // FREE CPU MEMORY
        free(h_x);
        free(h_y);
        free(h_out);
    
        // FREE GPU MEMORY
        dstat = cudaFree(d_x);
        CHECK_CUDA_STATUS(dstat);
        dstat = cudaFree(d_y);
        CHECK_CUDA_STATUS(dstat);
        dstat = cudaFree(d_out);
        CHECK_CUDA_STATUS(dstat);
    
        return 0;
        
    
    }
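    Integer division in the launch configuration truncates, so the block count must be rounded up or the tail of the array is never processed; the kernel's bounds check (index < ARRAYSize) then discards the surplus threads. A minimal sketch of the usual idiom:

    // Sketch: ceil(ARRAYSize / THREADS_PER_BLOCK) with integer arithmetic.
    int blocks = (ARRAYSize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    add<<<blocks, THREADS_PER_BLOCK>>>(d_out, d_x, d_y);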

    <3> Unified memory:

    #include <iostream>
    #include <math.h>
    // Kernel function to add the elements of two arrays
    __global__
    void add(int n, float *x, float *y)
    {
      for (int i = 0; i < n; i++)
        y[i] = x[i] + y[i];
    }
    
    int main(void)
    {
      int N = 1<<20;
      float *x, *y;
    
      // Allocate Unified Memory – accessible from CPU or GPU
      cudaMallocManaged(&x, N*sizeof(float));
      cudaMallocManaged(&y, N*sizeof(float));
    
      // initialize x and y arrays on the host
      for (int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
      }
    
      // Run kernel on 1M elements on the GPU
      add<<<1, 1>>>(N, x, y);
    
      // Wait for GPU to finish before accessing on host
      cudaDeviceSynchronize();
    
      // Check for errors (all values should be 3.0f)
      float maxError = 0.0f;
      for (int i = 0; i < N; i++)
        maxError = fmax(maxError, fabs(y[i]-3.0f));
      std::cout << "Max error: " << maxError << std::endl;
    
      // Free memory
      cudaFree(x);
      cudaFree(y);
      
      return 0;
    }
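    With managed memory, pages migrate between host and device on demand. On platforms that support it, the migrations can be hinted ahead of time with cudaMemPrefetchAsync; a hedged sketch, assuming a single GPU at device 0:

    // Sketch: prefetch the managed arrays to the GPU before the launch and
    // bring the result back before the host verification loop.
    int device = 0;
    cudaMemPrefetchAsync(x, N * sizeof(float), device);           // host -> GPU
    cudaMemPrefetchAsync(y, N * sizeof(float), device);
    add<<<1, 1>>>(N, x, y);
    cudaMemPrefetchAsync(y, N * sizeof(float), cudaCpuDeviceId);  // GPU -> host
    cudaDeviceSynchronize();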

    <4> Some tips

    (1)

    One-dimensional blocks are laid out within the grid. In the kernel below, each thread computes a global index from its block and thread indices, then strides by the total number of threads in the grid:

    __global__
    void add(int n, float *x, float *y)
    {
      int index = blockIdx.x * blockDim.x + threadIdx.x;
      int stride = blockDim.x * gridDim.x;
      for (int i = index; i < n; i += stride)
        y[i] = x[i] + y[i];
    }
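    A typical launch for this grid-stride kernel picks a fixed block size and enough blocks to cover the array once; any leftover elements are handled by the stride loop. A minimal sketch (N, x and y as in the unified-memory example above):

    // Sketch: launch the grid-stride kernel over N elements.
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;  // round up
    add<<<numBlocks, blockSize>>>(N, x, y);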

    (2) Shared memory: this is memory shared among the threads of a single block.

     

     Code:

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>
    #include <device_functions.h>
    
    #define RADIUS 3
    #define BLOCKSIZE 10
    
    __global__ void process(int *d_out,int *d_in,int *shared_mem)
    {
        __shared__ int temp[BLOCKSIZE + 2* RADIUS ];
        int gindex = threadIdx.x + blockIdx.x * blockDim.x;
        int lindex = threadIdx.x + RADIUS;
        //printf("%d ",lindex);
        // Read input elements into shared memory
        temp[lindex] = d_in[gindex];
    
    
        
        
        // Load the halo cells at each end of the block. Note that at the
        // first and last blocks these reads fall outside d_in; see the
        // guarded variant after this example.
        if (threadIdx.x < RADIUS)
        {
            temp[lindex - RADIUS] = d_in[gindex - RADIUS];
            temp[lindex + BLOCKSIZE] = d_in[gindex + BLOCKSIZE];
        }
    
    
        shared_mem[lindex] = lindex;
        // this code for debug
        
        
    
    
        __syncthreads();
        
    
        // Apply the stencil
         int result = 0;
         for (int offset = -RADIUS ; offset <= RADIUS ; offset++)
         {
             result += temp[lindex + offset];
             
         }
            
         // Store the result
         d_out[gindex] = result;
         
    }
    
    
    
    int main(int argc,char**argv)
    {
        // allocation of memory
        
        int host_rawSize = 10;
    
        int host_bytes = sizeof(int) * host_rawSize;
        int shared_bytes = (host_rawSize+2*RADIUS) * sizeof(int);
    
    
        int *host_data          = (int*)malloc(host_bytes);
        int *host_outData       = (int*)malloc(host_bytes);
        int *host_sharedMemData = (int*)malloc(shared_bytes);
        for(int i=0;i<host_rawSize;i++)
        {
            host_data[i] = int(i)+1;
        }
        for(int i=0;i<host_rawSize;i++)
        {
            fprintf(stdout,"%d   ",host_data[i]);
        }
        fprintf(stdout, "\n");
    
    
    
        int *dev_in;
        cudaMallocManaged((void**)&dev_in  , host_bytes);
        //cudaMallocManaged(&dev_in  , host_bytes);
        //cudaMalloc((void**)&dev_rawdata,bytes);
        cudaMemcpy(dev_in,host_data,host_bytes,cudaMemcpyHostToDevice);
    
    
    
        int dev_out_bytes  = host_rawSize * sizeof(int);
        int *dev_out;
        int *dev_shared;
        cudaMallocManaged(&dev_out    , dev_out_bytes);
        cudaMallocManaged(&dev_shared , shared_bytes);
    
        process<<<1,host_rawSize>>>(dev_out,dev_in,dev_shared);
    
        cudaMemcpy(host_outData,      dev_out,   dev_out_bytes,cudaMemcpyDeviceToHost);
        cudaMemcpy(host_sharedMemData,dev_shared,shared_bytes,cudaMemcpyDeviceToHost);
    
    
        printf("===============Debug the gpu shared memory=======================\n");
        for(int i=0;i<host_rawSize + 2*RADIUS;i++)
        {
            fprintf(stdout,"%d   ",host_sharedMemData[i]);
        }
        printf("\n===============Debug the gpu shared memory=======================\n");
    
    
        for(int i=0;i<host_rawSize;i++)
        {
            fprintf(stdout,"%d   ",host_outData[i]);
        }
        fprintf(stdout, "\n");
    
        getchar();
    
        return 0;
    }
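    One caveat in the stencil above: the halo loads read d_in[gindex - RADIUS] and d_in[gindex + BLOCKSIZE], which fall outside the input array at the first and last blocks. A hedged sketch of a guarded load that clamps to the array edge; n, the input length, is an assumed extra kernel parameter:

    // Sketch: clamp halo reads to [0, n - 1] so edge blocks stay in bounds.
    // 'n' (the input length) is assumed to be passed to the kernel.
    if (threadIdx.x < RADIUS)
    {
        int left  = gindex - RADIUS;
        int right = gindex + blockDim.x;
        temp[lindex - RADIUS]     = d_in[left < 0 ? 0 : left];
        temp[lindex + blockDim.x] = d_in[right > n - 1 ? n - 1 : right];
    }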

    <5> Simple calculation: RGBA to grayscale

    Two common conversions are the plain average, I = (R + G + B) / 3, and the weighted luma formula used in the code below:

    I = 0.299f * R + 0.587f * G + 0.114f * B

    CPU:

    // Serial implementation for running on CPU using a single thread.
    void rgbaToGreyscaleCpu(const uchar4* const rgbaImage, unsigned char *const greyImage,
            const size_t numRows, const size_t numCols)
    {
        for (size_t r = 0; r < numRows; ++r) {
            for (size_t c = 0; c < numCols; ++c) {
                const uchar4 rgba = rgbaImage[r * numCols + c];
                const float channelSum = .299f * rgba.x + .587f * rgba.y + .114f * rgba.z;
                greyImage[r * numCols + c] = channelSum;
            }
        }
    }

    GPU:

    // CUDA kernel which is run in parallel by many GPU threads.
    __global__
    void rgbaToGreyscaleCudaKernel(const uchar4* const rgbaImage,
            unsigned char* const greyImage,
            const int numRows, const int numCols)
    {
        //First create a mapping from the 2D block and grid locations
        //to an absolute 2D location in the image, then use that to
        //calculate a 1D offset
        const long pointIndex = threadIdx.x + blockDim.x*blockIdx.x;
     
        if(pointIndex<numRows*numCols) { // this is necessary only if too many threads are started
            uchar4 const imagePoint = rgbaImage[pointIndex];
            greyImage[pointIndex] = .299f*imagePoint.x + .587f*imagePoint.y  + .114f*imagePoint.z;
        }
    }
     
    // Parallel implementation for running on GPU using multiple threads.
    void rgbaToGreyscaleCuda(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
            unsigned char* const d_greyImage, const size_t numRows, const size_t numCols)
    {
        const int blockThreadSize = 256;
        const int numberOfBlocks = 1 + ((numRows*numCols - 1) / blockThreadSize); // a/b rounded up
        const dim3 blockSize(blockThreadSize, 1, 1);
        const dim3 gridSize(numberOfBlocks , 1, 1);
        rgbaToGreyscaleCudaKernel<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
    }
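    A minimal host-side sketch of driving the wrapper; h_rgbaImage, h_greyImage, numRows and numCols are assumed to come from the caller's image loader:

    // Sketch: allocate device buffers, upload the RGBA image, convert,
    // then download the grey image.
    const size_t numPixels = numRows * numCols;
    uchar4 *d_rgbaImage;
    unsigned char *d_greyImage;
    cudaMalloc(&d_rgbaImage, numPixels * sizeof(uchar4));
    cudaMalloc(&d_greyImage, numPixels * sizeof(unsigned char));
    cudaMemcpy(d_rgbaImage, h_rgbaImage, numPixels * sizeof(uchar4), cudaMemcpyHostToDevice);
    rgbaToGreyscaleCuda(h_rgbaImage, d_rgbaImage, d_greyImage, numRows, numCols);
    cudaMemcpy(h_greyImage, d_greyImage, numPixels * sizeof(unsigned char), cudaMemcpyDeviceToHost);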
  • Original post: https://www.cnblogs.com/gearslogy/p/7113009.html