zoukankan      html  css  js  c++  java
  • cuda中当数组数大于线程数的处理方法

    参考stackoverflow一篇帖子的处理方法:https://stackoverflow.com/questions/26913683/different-way-to-index-threads-in-cuda-c

    代码中cuda_gridsize函数参考yolo。

    代码如下:

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <cmath>
    #include <cstring>
    #include <ctime>
    #include <iostream>
    
    using namespace std;
    // Threads per block: used both to size the grid in cuda_gridsize and as the
    // block dimension of the gpuCalc launch.
    #define BLOCK 512
    
    // Computes a launch grid that provides at least n threads when every block
    // holds BLOCK threads.
    //
    // n        number of elements to cover.
    // returns  dim3(x, y, 1). A 1D grid is used while the block count fits in
    //          65535; beyond that the blocks are spread over a roughly square
    //          x-by-y grid, which may contain more threads than n, so kernels
    //          launched with it must bounds-check their index.
    dim3 cuda_gridsize(size_t n){
        // n is unsigned, so (n - 1) would wrap around for n == 0. A single block
        // keeps the launch configuration valid; a bounds-checked kernel does nothing.
        if (n == 0){
            return dim3(1, 1, 1);
        }

        // Number of BLOCK-sized blocks needed (ceiling division).
        size_t k = (n - 1) / BLOCK + 1;

        // Keep the counts in size_t until the end so that a very large k is not
        // truncated to 32 bits before the 65535 comparison.
        size_t x = k;
        size_t y = 1;
        if (x > 65535){
            x = (size_t)ceil(sqrt((double)k));
            y = (n - 1) / (x*BLOCK) + 1;
        }

        dim3 d((unsigned int)x, (unsigned int)y, 1);
        //printf("%zu %zu %zu %zu\n", n, x, y, x*y*BLOCK);
        return d;
    }
    
    // Inverts every byte of img in place: img[i] = 255 - img[i] for i in [0, H*W).
    //
    // img   device pointer to at least H*W bytes.
    // H, W  image dimensions; only their product is used.
    //
    // Works for any 1D or 2D grid of 1D or 2D blocks: each thread starts at its
    // flattened global index and then advances by the total number of launched
    // threads, so the launch does not need to supply one thread per element.
    __global__ void gpuCalc(unsigned char *img,long H,long W)
    {
        // All index math is done in 64 bits. The built-in dimension variables are
        // unsigned int and `long` is only 32 bits on some platforms, so the
        // products below could otherwise wrap for very large images or grids.
        const long long total = (long long)H * (long long)W;

        const long long threadsPerBlock = (long long)blockDim.x * blockDim.y;
        const long long threadInBlock = threadIdx.x + (long long)threadIdx.y * blockDim.x;
        const long long blockInGrid = blockIdx.x + (long long)blockIdx.y * gridDim.x;

        // Alternative indexing scheme:
        //long long i = ((long long)gridDim.x*blockDim.x)*(threadIdx.y + (long long)blockDim.y*blockIdx.y) + (threadIdx.x + (long long)blockDim.x*blockIdx.x);

        // Total number of threads in the whole launch.
        const long long stride = threadsPerBlock * gridDim.x * gridDim.y;

        for (long long i = threadInBlock + threadsPerBlock * blockInGrid; i < total; i += stride){
            img[i] = 255 - img[i];
        }
    }
    
    // Inverts an H*W 8-bit image in place on the GPU: copies img to device 0,
    // runs gpuCalc over it, and copies the result back into img.
    //
    // img   host buffer of at least H*W bytes; overwritten with the result.
    // H, W  image dimensions; only their product is used.
    //
    // On any CUDA failure the error is printed to stderr and the function
    // returns; img may then be unchanged. Non-positive dimensions or a null img
    // are treated as "nothing to do".
    void addWithCuda(unsigned char *img, long H,long W)
    {
        if (img == 0 || H <= 0 || W <= 0)
            return;

        // Size computed in size_t so the product cannot overflow a 32-bit long.
        const size_t count = (size_t)H * (size_t)W;
        const size_t bytes = count * sizeof(unsigned char);

        unsigned char *dev_a = 0;

        // Each step runs only if every previous step succeeded.
        cudaError_t err = cudaSetDevice(0);
        if (err == cudaSuccess)
            err = cudaMalloc((void**)&dev_a, bytes);
        if (err == cudaSuccess)
            err = cudaMemcpy(dev_a, img, bytes, cudaMemcpyHostToDevice);
        if (err == cudaSuccess){
            gpuCalc<<<cuda_gridsize(count),BLOCK>>>(dev_a, H, W);
            // A launch returns no status; launch-configuration errors must be
            // fetched right after the launch.
            err = cudaGetLastError();
        }
        // This blocking copy also reports errors raised while the kernel executed.
        if (err == cudaSuccess)
            err = cudaMemcpy(img, dev_a, bytes, cudaMemcpyDeviceToHost);

        // Always release the device buffer; freeing a null pointer is a no-op.
        cudaFree(dev_a);

        if (err != cudaSuccess)
            fprintf(stderr, "addWithCuda failed: %s\n", cudaGetErrorString(err));
    }
    
    // Host-side reference: replaces each of the first H*W bytes of img with
    // 255 minus its value. Only the product of W and H matters.
    void cpuCalc(unsigned char *img,long W, long H)
    {
        const long total = H*W;
        unsigned char *px = img;
        for (long k = 0; k < total; ++k, ++px)
        {
            *px = 255 - *px;
        }
    }
    
    // Self-check driver: fills a W*H image with pseudo-random bytes, inverts it
    // once on the CPU and once more on the GPU, and verifies the result equals
    // the untouched copy (two inversions must restore the original data).
    // Prints "pass" or "no pass" and waits for a key press before exiting.
    int main()
    {
        long W = 20000;
        long H = 20000;

        unsigned char *img = new unsigned char[W*H];
        unsigned char *cmp = new unsigned char[W*H];

        for (long i = 0; i < H*W; i++)
            img[i] = rand() % 100;

        // Keep an unmodified copy to compare against after both inversions.
        memcpy(cmp, img, H*W);

        cpuCalc(img, W, H);
        printf("cpu calc end\n");

        // Second inversion, this time on the GPU; img should now match cmp again.
        addWithCuda(img, H, W);
        printf("gpu calc end\n");

        bool flag = true;
        for (long i = 0; i < H*W; i++)
        {
            if (img[i] != cmp[i])
            {
                printf("no pass\n");
                flag = false;
                break;
            }
        }
        if (flag)
            printf("pass");

        delete[] cmp;
        delete[] img;
        getchar();

        return 0;
    }
  • 相关阅读:
    2021/6/28
    2021/6/25
    IDEA快捷键
    maven的一些问题
    Maven安装
    2021/6/14
    nmcli 网络管理工具
    linux yum仓库配置
    linux 防火墙selinux ,firewalld, iptables
    linux root密码重置
  • 原文地址:https://www.cnblogs.com/tiandsp/p/9460130.html
Copyright © 2011-2022 走看看