zoukankan      html  css  js  c++  java
  • cuda yv12_to_rgb24

    前言

      项目需要将yv12转rgb24,由于基于x86平台,开始就没多想,直接用ipp加速实现了,后来在评估项目瓶颈的时候发现,1080p的视频每一帧转换居然要花8ms,刚好项目里有用到nvidia gtx960,因此就产生了直接用cuda实现一个yv12转rgb24的想法。

    具体实施

      我一向不喜欢造轮子,因此,第一步就是搜索有没有现成的代码。搜索了很久,包括opencv里都没找到yv12 to rgb24的,还好网上找到了一篇yv12 to argb的,我拿过来照着改改就ok了(包括代码风格及bug修复)。下面直接贴出代码,有任何疑问,可以留言讨论:

    #include "cuda.h"
    #include "cuda_runtime.h"
    #include "cuda_runtime_api.h"
    #include <stdio.h>
    
    // Working precision of the conversion: each colour component is handled as a
    // 10-bit value (the kernel shifts the 8-bit input samples left by two bits).
    #define COLOR_COMPONENT_BIT_SIZE 10
    // Bit mask selecting a single 10-bit component.
    #define COLOR_COMPONENT_MASK     0x3FF
    
    // Row-major 3x3 conversion matrix read by yuv2rgb(): rows 0/1/2 produce red,
    // green and blue; within a row the columns multiply luma, Cb and Cr.
    __constant__ float const_hue_colorspace_mat[9]={1.1644f,0.0f,1.596f,1.1644f,-0.3918f,-0.813f,1.1644f,2.0172f,0.0f};
    
    /*
     * Converts one YUV triple to floating-point RGB using the 3x3 matrix stored
     * in const_hue_colorspace_mat.
     *
     * yuvi  : three ints {Y, Cb, Cr}; 512 is subtracted from both chroma values
     *         before the matrix is applied.
     * red, green, blue : receive the unclamped results (the caller clamps).
     *
     * NOTE(review): luma is scaled without any black-level offset being removed
     * first; with a 1.1644 gain this looks like a limited-range conversion --
     * confirm that the missing offset is intentional.
     */
    __device__ static void yuv2rgb(const int *yuvi, float *red, float *green,float *blue)
    {
        const float *m = const_hue_colorspace_mat;

        // Centre the chroma samples around zero; luma is used as-is.
        const float y  = (float)yuvi[0];
        const float cb = (float)yuvi[1] - 512.0f;
        const float cr = (float)yuvi[2] - 512.0f;

        // One matrix row per output channel.
        *red   = (y * m[0]) + (cb * m[1]) + (cr * m[2]);
        *green = (y * m[3]) + (cb * m[4]) + (cr * m[5]);
        *blue  = (y * m[6]) + (cb * m[7]) + (cr * m[8]);
    }
    
    /*
     * Converts a planar YV12 image to packed 24-bit pixels, one 2x2 pixel block
     * per thread (the four pixels share a single chroma sample pair).
     *
     * src       : full-resolution luma plane of width*height bytes, followed by
     *             the V plane and then the U plane, each (width/2) bytes per row.
     * dst       : output image; every pixel is written as three bytes in
     *             blue, green, red order.
     * width     : luma width in pixels, also used as the luma row stride.
     * height    : luma height in pixels.
     * dst_pitch : distance in bytes between consecutive output rows.
     *
     * Expects a 2D launch in which thread (tx, ty) of the grid handles the block
     * whose top-left pixel is (2*tx, 2*ty). Blocks that do not fit completely
     * inside the image are skipped, so with an odd width or height the last
     * column or row is left unwritten.
     */
    __global__ void yv12torgb24_fourpixel(const unsigned char *src, unsigned char *dst, int width, int height, int dst_pitch)
    {
        // Top-left corner of the 2x2 block owned by this thread.
        const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
        const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;

        const bool block_inside = ((x + 1) < width) && ((y + 1) < height);
        if (!block_inside)
            return;

        // 8-bit samples are widened to the 10-bit working range yuv2rgb expects.
        const int extra_bits = COLOR_COMPONENT_BIT_SIZE - 8;

        // Plane layout: luma, then V, then U; chroma planes are half size in both directions.
        const unsigned int v_base       = width * height;
        const unsigned int u_base       = v_base + (v_base >> 2);
        const unsigned int chroma_pitch = width >> 1;
        const unsigned int chroma_index = (y >> 1) * chroma_pitch + (x >> 1);

        const int cb = src[u_base + chroma_index] << extra_bits;   // U
        const int cr = src[v_base + chroma_index] << extra_bits;   // V

        // Visit the four pixels in the order (x,y), (x+1,y), (x,y+1), (x+1,y+1).
        #pragma unroll
        for (int p = 0; p < 4; ++p)
        {
            const int px = x + (p & 1);
            const int py = y + (p >> 1);

            int yuv[3];
            yuv[0] = src[py * width + px] << extra_bits;
            yuv[1] = cb;
            yuv[2] = cr;

            float r, g, b;
            yuv2rgb(yuv, &r, &g, &b);

            // Clamp to the 10-bit range, then drop the two extra bits to get a byte.
            const float bgr[3] = { b, g, r };
            unsigned char *out = dst + (py * dst_pitch + px * 3);
            #pragma unroll
            for (int c = 0; c < 3; ++c)
            {
                const float clamped = ::fmin(::fmax(bgr[c], 0.0f), 1023.f);
                out[c] = (((unsigned int)clamped) & COLOR_COMPONENT_MASK) >> extra_bits;
            }
        }
    }
    
    /*
     * Uploads a host YV12 frame to the GPU and converts it with
     * yv12torgb24_fourpixel.
     *
     * src        : host buffer holding src_width * src_height * 3 / 2 bytes.
     * dst        : output buffer; it is handed to the kernel unchanged, so it
     *              must be memory the device can write to.
     * src_width  : frame width in pixels.
     * src_height : frame height in pixels.
     * dst_pitch  : distance in bytes between consecutive output rows.
     *
     * Returns true when the upload, the kernel and the cleanup all succeeded,
     * and false on invalid arguments or on any CUDA error (the error text is
     * printed to stderr). The call blocks until the kernel has finished.
     *
     * A staging buffer is allocated and released on every call.
     */
    bool yv12_to_rgb24(unsigned char *src, unsigned char *dst,int src_width,int src_height, int dst_pitch)
    {
        if (src == NULL || dst == NULL || src_width <= 0 || src_height <= 0)
            return false;

        // Computed in size_t so large frames cannot overflow 32-bit arithmetic.
        const size_t src_mem_size = (size_t)src_width * (size_t)src_height * 3 / 2;

        // Each thread converts a 2x2 pixel block, hence the factor of two.
        const dim3 block(32, 8);
        const dim3 grid((src_width  + 2 * block.x - 1) / (2 * block.x),
                        (src_height + 2 * block.y - 1) / (2 * block.y));

        unsigned char *d_src = NULL;
        cudaError_t err = cudaMalloc((void**)&d_src, src_mem_size);
        if (err != cudaSuccess)
        {
            fprintf(stderr, "yv12_to_rgb24: %s\n", cudaGetErrorString(err));
            return false;
        }

        err = cudaMemcpy(d_src, src, src_mem_size, cudaMemcpyHostToDevice);
        if (err == cudaSuccess)
        {
            yv12torgb24_fourpixel<<<grid,block>>>(d_src, dst, src_width, src_height, dst_pitch);
            err = cudaGetLastError();           // launch-configuration errors
        }
        if (err == cudaSuccess)
            err = cudaDeviceSynchronize();      // errors raised while the kernel ran

        // Always release the staging buffer; report its error only if nothing failed earlier.
        const cudaError_t free_err = cudaFree(d_src);
        if (err == cudaSuccess)
            err = free_err;

        if (err != cudaSuccess)
        {
            fprintf(stderr, "yv12_to_rgb24: %s\n", cudaGetErrorString(err));
            return false;
        }
        return true;
    }
    

    总结

    经过cuda加速后的转换能够在1ms左右完成,还是比较理想的。

    完!
    2016年8月

  • 相关阅读:
    python3下搜狗AI API实现
    辽宁移动宽带体验及魔百盒M101s-2刷机
    年会抽奖系统-支持作弊,指定中奖
    论python3下“多态”与“继承”中坑
    fiddler抓手机报文的配置指南
    微软BI套件
    IntelliJ IDEA 2018 Community(社区版)创建J2EE项目+Tomcat9部署
    Spring.net的Demo项目,了解什么是控制反转
    manjaro安装virtualbox教程
    Debian9 安装后的配置笔记
  • 原文地址:https://www.cnblogs.com/rongpmcu/p/7662802.html
Copyright © 2011-2022 走看看