zoukankan      html  css  js  c++  java
  • YOLOv3处理图片优化——cuda bilinear resize

    YOLOv3中处理一张1080P的图片,resize到输入416*416尺寸,调用内部接口做cpu resize,可能80%~90%的时间耗在图像解码、resize上,对比推理时间耗时严重。尝试用cuda做外部resize。
    修改下工程用于Ubuntu16.04,1080ti显卡,提供个包其中需要cmakelist修改下opencv路径。


    下载:https://pan.baidu.com/s/10RC1Lvxt4FFg5bsbrtnX8w

    resizeGPU.cu

     
    #include "resizeGPU.cuh"
    //#define _DEBUG
     
    #define BLOCK_DIM 64
    #define threadNum 1024
    #define WARP_SIZE 32
    #define elemsPerThread 1
     
    int32_t* deviceDataResized; //отмасштабированное изображение в памяти GPU
    int32_t* deviceData; //оригинальное изображение в памяти GPU
    int32_t* hostOriginalImage;
    int32_t* hostResizedImage;
     
    void reAllocPinned(int w, int h, int w2, int h2, int32_t* dataSource)
    {
        cudaMallocHost((void**)&hostOriginalImage, w*h* sizeof(int32_t)); // host pinned
        cudaMallocHost((void**)&hostResizedImage, w2*h2 * sizeof(int32_t)); // host pinned
        memcpy(hostOriginalImage, dataSource, w*h * sizeof(int32_t));
     
        return;
    }
     
    void freePinned()
    {
        cudaFreeHost(hostOriginalImage);
        cudaFreeHost(hostResizedImage);
     
        return;
    }
     
    void initGPU(const int maxResolutionX, const int maxResolutionY)
    {
        cudaMalloc((void**)&deviceDataResized, maxResolutionX*maxResolutionY * sizeof(int32_t));
        cudaMalloc((void**)&deviceData, maxResolutionX*maxResolutionY * sizeof(int32_t));
     
        return;
    }
     
    void deinitGPU()
    {
        cudaFree(deviceData);
        cudaFree(deviceDataResized);
     
        return;
    }
     
    __global__ void SomeKernel(int32_t* originalImage, int32_t* resizedImage, int w, int h, int w2, int h2/*, float x_ratio, float y_ratio*/)
    {
        __shared__ int32_t tile[1024];
        const float x_ratio = ((float)(w - 1)) / w2;
        const float y_ratio = ((float)(h - 1)) / h2;
        //const int blockbx = blockIdx.y * w2 + blockIdx.x*BLOCK_DIM;
        //unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x;
        unsigned int threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
        //__shared__ float result[threadNum*elemsPerThread];
        unsigned int shift = 0;
        //int32_t a, b, c, d, x, y, index;
        while((threadId < w2*h2 && shift<elemsPerThread))
        {
            const int32_t i = threadId / w2;
            const int32_t j = threadId - (i*w2);
            //float x_diff, y_diff, blue, red, green;
            
            const int32_t x = (int)(x_ratio * j);
            const int32_t y = (int)(y_ratio * i);
            const float x_diff = (x_ratio * j) - x;
            const float y_diff = (y_ratio * i) - y;
            const int32_t index = (y*w + x);
            const int32_t a = originalImage[index];
            const int32_t b = originalImage[index + 1];
            const int32_t c = originalImage[index + w];
            const int32_t d = originalImage[index + w + 1];
            // blue element
            // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
            const float blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
                (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
     
            // green element
            // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
            const float green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
                ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
     
            // red element
            // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
            const float red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
                ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
     
            /*
            resizedImage[threadId] =
                0xff000000 |
                ((((int32_t)red) << 16) & 0xff0000) |
                ((((int32_t)green) << 8) & 0xff00) |
                ((int32_t)blue);
            */
            tile[threadIdx.x] =
                0xff000000 |
                ((((int32_t)red) << 16) & 0xff0000) |
                ((((int32_t)green) << 8) & 0xff00) |
                ((int32_t)blue);
     
            threadId++;
            //threadId+= WARP_SIZE;
            shift++;
        }
        
        __syncthreads();
        threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread;
        resizedImage[threadId] = tile[threadIdx.x];
        /*
        shift--;
        threadId = blockIdx.x * threadNum*elemsPerThread + threadIdx.x*elemsPerThread+ shift;
        while (shift >= 0)
        {
            resizedImage[threadId] = tile[shift];
            shift--;
            threadId--;
        }
        */
    }
     
     
     
    int32_t* resizeBilinear_gpu(int w, int h, int w2, int h2)
    {
    #ifdef _DEBUG
        cudaError_t error; //store cuda error codes
    #endif
        int length = w2 * h2;
     
        // Копирование исходных данных в GPU для обработки
        cudaMemcpy(deviceData, hostOriginalImage, w*h * sizeof(int32_t), cudaMemcpyHostToDevice);
        //cudaMemcpy2D(deviceData, w * sizeof(int32_t), hostOriginalImage, w * sizeof(int32_t), w * sizeof(int32_t), h, cudaMemcpyHostToDevice);
        //error = cudaMemcpyToSymbol(deviceData, pixels, w*h * sizeof(int32_t),0, cudaMemcpyHostToDevice);
    #ifdef _DEBUG
        if (error != cudaSuccess)
        {
            printf("cudaMemcpy (pixels->deviceData), returned error %s (code %d), line(%d)
    ", cudaGetErrorString(error), error, __LINE__);
            exit(EXIT_FAILURE);
        }
    #endif
     
        dim3 threads = dim3(threadNum, 1,1); //block size 32,32,x
        dim3 blocks = dim3(w2*h2/ threadNum*elemsPerThread, 1,1);
        //printf("Blockdim.x %d
    ", blocks.x);
        //printf("thrdim.x %d
    ", threads.x);
     
        // Запуск ядра из (length / 256) блоков по 256 потоков,
        // предполагая, что length кратно 256
        SomeKernel << <blocks, threads >> >(deviceData, deviceDataResized, w, h, w2, h2/*, x_ratio, y_ratio*/);
     
     
        cudaDeviceSynchronize();
        // Считывание результата из GPU
        cudaMemcpy(hostResizedImage, deviceDataResized, length * sizeof(int32_t), cudaMemcpyDeviceToHost);
     
        return hostResizedImage;
    }

    converter.cpp

    #include "converter.hpp"
     
    int32_t* cvtMat2Int32(const cv::Mat& srcImage)
    {
        int32_t *result = new int32_t[srcImage.cols*srcImage.rows];
        int offset = 0;
     
        for (int i = 0; i<srcImage.cols*srcImage.rows * 3; i += 3)
        {
            int32_t blue = srcImage.data[i];
            int32_t green = srcImage.data[i + 1];
            int32_t red = srcImage.data[i + 2];
            result[offset++] =
                0xff000000 |
                ((((int32_t)red) << 16) & 0xff0000) |
                ((((int32_t)green) << 8) & 0xff00) |
                ((int32_t)blue);
        }
     
        return result;
    }
     
    void cvtInt322Mat(int32_t *pxArray, cv::Mat& outImage)
    {
        int offset = 0;
        for (int i = 0; i<outImage.cols*outImage.rows * 3; i += 3)
        {
            int32_t a = pxArray[offset++];
            int32_t blue = a & 0xff;
            int32_t green = ((a >> 8) & 0xff);
            int32_t red = ((a >> 16) & 0xff);
            outImage.data[i] = blue;
            outImage.data[i + 1] = green;
            outImage.data[i + 2] = red;
        }
        return;
    }

    resizeCPU.cpp

     
    #include "resizeCPU.hpp"
     
    int* resizeBilinear_cpu(int32_t* pixels, int w, int h, int w2, int h2)
    {
        int32_t* temp = new int32_t[w2*h2];
        int32_t a, b, c, d, x, y, index;
        float x_ratio = ((float)(w - 1)) / w2;
        float y_ratio = ((float)(h - 1)) / h2;
        float x_diff, y_diff, blue, red, green;
        int offset = 0;
        for (int i = 0; i<h2; i++)
        {
            for (int j = 0; j<w2; j++)
            {
                x = (int)(x_ratio * j);
                y = (int)(y_ratio * i);
                x_diff = (x_ratio * j) - x;
                y_diff = (y_ratio * i) - y;
                index = (y*w + x);
                a = pixels[index];
                b = pixels[index + 1];
                c = pixels[index + w];
                d = pixels[index + w + 1];
     
                // blue element
                // Yb = Ab(1-w)(1-h) + Bb(w)(1-h) + Cb(h)(1-w) + Db(wh)
                blue = (a & 0xff)*(1 - x_diff)*(1 - y_diff) + (b & 0xff)*(x_diff)*(1 - y_diff) +
                    (c & 0xff)*(y_diff)*(1 - x_diff) + (d & 0xff)*(x_diff*y_diff);
     
                // green element
                // Yg = Ag(1-w)(1-h) + Bg(w)(1-h) + Cg(h)(1-w) + Dg(wh)
                green = ((a >> 8) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 8) & 0xff)*(x_diff)*(1 - y_diff) +
                    ((c >> 8) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 8) & 0xff)*(x_diff*y_diff);
     
                // red element
                // Yr = Ar(1-w)(1-h) + Br(w)(1-h) + Cr(h)(1-w) + Dr(wh)
                red = ((a >> 16) & 0xff)*(1 - x_diff)*(1 - y_diff) + ((b >> 16) & 0xff)*(x_diff)*(1 - y_diff) +
                    ((c >> 16) & 0xff)*(y_diff)*(1 - x_diff) + ((d >> 16) & 0xff)*(x_diff*y_diff);
     
                temp[offset++] =
                    0xff000000 |
                    ((((int32_t)red) << 16) & 0xff0000) |
                    ((((int32_t)green) << 8) & 0xff00) |
                    ((int32_t)blue);
            }
        }
        return temp;
    }

    对比下结果,在1080ti下,resize 1080P图片到416*416尺寸,cuda resize 1.6ms,cpu resize 3.8ms,

    darknet内部接口cpu resize 8.0ms。

    cpu resize相比darknet resize 接口主要是移位操作有提速,cuda resize处理时间减少很多,但是需要做数据类型Mat与Int32相互转换。

    酒是穿肠毒药,色是刮骨钢刀,财是惹祸根苗,气是雷烟火炮。 不过,无酒毕竟不成席,无色世上人渐稀,无财何人早早起,无气处处惹人欺。 饮酒不醉量为高,见色不迷真英豪,不义之财君莫取,忍气饶人祸自消。 酒色财气四堵墙,人人都在里边藏,谁若跳到墙外边,不是神仙也寿长。 君听我一言:做人,量体裁衣。
  • 相关阅读:
    关于Python装饰器内层函数为什么要return目标函数的一些个人见解
    多项式拟合与线性回归
    numpy基本方法总结 --good
    numpy中的convolve的理解
    最容易理解的对卷积(convolution)的解释
    Python之numpy基本指令
    线性回归原理小结
    矩阵的导数与迹
    【MyBatis学习14】MyBatis和Spring整合
    【MyBatis学习13】MyBatis中的二级缓存
  • 原文地址:https://www.cnblogs.com/laosan007/p/12612315.html
Copyright © 2011-2022 走看看