zoukankan      html  css  js  c++  java
  • SSE指令集加速之 I420转BGR24

    /*
     * _mm_shuffle_epi8 is an SSSE3 instruction. On GCC/Clang the target attribute
     * lets the SIMD functions below build without -mssse3 on the command line;
     * the CPU must still support SSSE3 at run time.
     */
    #ifndef YUV2BGR_SSSE3_FN
    #if defined(__GNUC__) || defined(__clang__)
    #define YUV2BGR_SSSE3_FN __attribute__((target("ssse3")))
    #else
    #define YUV2BGR_SSSE3_FN
    #endif
    #endif

    /*
     * Fixed-point conversion constants. Biased samples are multiplied by 8 and
     * only the high 16 bits of the signed 16x16 product are kept, so every factor
     * is "coefficient * 8192". The values approximate the usual limited-range
     * BT.601 coefficients (1.164, 1.596, -0.392, -0.813, 2.017).
     */
    enum {
        YUV2BGR_Y_BIAS  = 16,
        YUV2BGR_UV_BIAS = 128,
        YUV2BGR_FAC_Y   = 0x2543,  /*  9539 */
        YUV2BGR_FAC_RV  = 0x3313,  /* 13075 */
        YUV2BGR_FAC_GU  = -3209,   /* 0xF377 read as int16 */
        YUV2BGR_FAC_GV  = -6660,   /* 0xE5FC read as int16 */
        YUV2BGR_FAC_BU  = 0x408D   /* 16525 */
    };

    /* Per-channel chroma contributions for 16 horizontally adjacent pixels. */
    typedef struct {
        __m128i r_lo, r_hi;   /* V term of red,   pixels 0-7 / 8-15 */
        __m128i g_lo, g_hi;   /* U+V term of green                  */
        __m128i b_lo, b_hi;   /* U term of blue                     */
    } yuv2bgr_chroma16;

    /* Scalar equivalent of _mm_mulhi_epi16: floor(a * b / 65536). */
    static int yuv2bgr_mulhi(int a, int b)
    {
        const int p = a * b;
        /* Avoid right-shifting a negative value (implementation-defined in C). */
        return (p >= 0) ? (p >> 16) : -((65535 - p) >> 16);
    }

    /* Saturate to 0..255, matching _mm_packus_epi16. */
    static uint8_t yuv2bgr_clamp(int v)
    {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Convert one pixel with the same arithmetic as the SIMD path; writes B, G, R. */
    static void yuv2bgr_pixel(uint8_t *dst, int y, int u, int v)
    {
        const int ys = yuv2bgr_mulhi((y - YUV2BGR_Y_BIAS) * 8, YUV2BGR_FAC_Y);
        const int us = (u - YUV2BGR_UV_BIAS) * 8;
        const int vs = (v - YUV2BGR_UV_BIAS) * 8;

        dst[0] = yuv2bgr_clamp(ys + yuv2bgr_mulhi(us, YUV2BGR_FAC_BU));
        dst[1] = yuv2bgr_clamp(ys + yuv2bgr_mulhi(us, YUV2BGR_FAC_GU)
                                  + yuv2bgr_mulhi(vs, YUV2BGR_FAC_GV));
        dst[2] = yuv2bgr_clamp(ys + yuv2bgr_mulhi(vs, YUV2BGR_FAC_RV));
    }

    /* Load 8 U and 8 V samples and expand them into the chroma terms of 16 pixels. */
    static inline YUV2BGR_SSSE3_FN void yuv2bgr_chroma_terms(const uint8_t *usrc, const uint8_t *vsrc,
        yuv2bgr_chroma16 *out)
    {
        const __m128i zero  = _mm_setzero_si128();
        const __m128i bias  = _mm_set1_epi16((short)YUV2BGR_UV_BIAS);
        const __m128i fac_rv = _mm_set1_epi16((short)YUV2BGR_FAC_RV);
        const __m128i fac_gu = _mm_set1_epi16((short)YUV2BGR_FAC_GU);
        const __m128i fac_gv = _mm_set1_epi16((short)YUV2BGR_FAC_GV);
        const __m128i fac_bu = _mm_set1_epi16((short)YUV2BGR_FAC_BU);

        /* Widen 8 bytes to 8 x 16-bit lanes. */
        const __m128i u16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)usrc), zero);
        const __m128i v16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)vsrc), zero);

        /* Duplicate every sample so that one chroma value covers two luma columns,
         * then remove the bias and pre-scale by 8. */
        const __m128i u_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(u16, u16), bias), 3);
        const __m128i u_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(u16, u16), bias), 3);
        const __m128i v_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(v16, v16), bias), 3);
        const __m128i v_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(v16, v16), bias), 3);

        out->r_lo = _mm_mulhi_epi16(fac_rv, v_lo);
        out->r_hi = _mm_mulhi_epi16(fac_rv, v_hi);
        out->g_lo = _mm_add_epi16(_mm_mulhi_epi16(fac_gu, u_lo), _mm_mulhi_epi16(fac_gv, v_lo));
        out->g_hi = _mm_add_epi16(_mm_mulhi_epi16(fac_gu, u_hi), _mm_mulhi_epi16(fac_gv, v_hi));
        out->b_lo = _mm_mulhi_epi16(fac_bu, u_lo);
        out->b_hi = _mm_mulhi_epi16(fac_bu, u_hi);
    }

    /* Convert 16 luma samples of one row and write exactly 48 bytes of B,G,R. */
    static inline YUV2BGR_SSSE3_FN void yuv2bgr_row16(uint8_t *dst, const uint8_t *ysrc,
        const yuv2bgr_chroma16 *c)
    {
        const __m128i zero  = _mm_setzero_si128();
        const __m128i ybias = _mm_set1_epi16((short)YUV2BGR_Y_BIAS);
        const __m128i fac_y = _mm_set1_epi16((short)YUV2BGR_FAC_Y);
        /* Drops byte 3 of every 4-byte pixel; indices with the top bit set yield 0. */
        const __m128i pack3 = _mm_set_epi8(-128, -128, -128, -128,
                                           14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);

        /* Unaligned load: callers are not required to pass 16-byte aligned rows. */
        const __m128i yv = _mm_loadu_si128((const __m128i *)ysrc);
        const __m128i y_lo = _mm_mulhi_epi16(
            _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(yv, zero), ybias), 3), fac_y);
        const __m128i y_hi = _mm_mulhi_epi16(
            _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(yv, zero), ybias), 3), fac_y);

        /* Add chroma terms and saturate each channel to 16 unsigned bytes. */
        const __m128i r = _mm_packus_epi16(_mm_add_epi16(y_lo, c->r_lo), _mm_add_epi16(y_hi, c->r_hi));
        const __m128i g = _mm_packus_epi16(_mm_add_epi16(y_lo, c->g_lo), _mm_add_epi16(y_hi, c->g_hi));
        const __m128i b = _mm_packus_epi16(_mm_add_epi16(y_lo, c->b_lo), _mm_add_epi16(y_hi, c->b_hi));

        /* Interleave to B,G,R,0 quads (four pixels per register). */
        const __m128i bg_lo = _mm_unpacklo_epi8(b, g);
        const __m128i bg_hi = _mm_unpackhi_epi8(b, g);
        const __m128i r0_lo = _mm_unpacklo_epi8(r, zero);
        const __m128i r0_hi = _mm_unpackhi_epi8(r, zero);

        const __m128i px0 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_lo, r0_lo), pack3);
        const __m128i px1 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_lo, r0_lo), pack3);
        const __m128i px2 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_hi, r0_hi), pack3);
        const __m128i px3 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_hi, r0_hi), pack3);

        /* Each 16-byte store carries 12 payload bytes; the 4 trailing zero bytes are
         * overwritten by the next store. The last group is copied with memcpy so
         * nothing is written beyond the 48 bytes owned by these 16 pixels. */
        _mm_storeu_si128((__m128i *)(dst +  0), px0);
        _mm_storeu_si128((__m128i *)(dst + 12), px1);
        _mm_storeu_si128((__m128i *)(dst + 24), px2);
        memcpy(dst + 36, &px3, 12);
    }

    /**
     * Convert a planar YUV 4:2:0 (I420) image to packed 24-bit pixels.
     *
     * Despite the name, each output pixel is stored in memory as B, G, R.
     * One U and one V sample is shared by a 2x2 group of luma samples.
     *
     * Full 16-pixel groups are converted with SSE2/SSSE3; any remaining columns
     * (width not a multiple of 16) are converted by a scalar loop that uses the
     * same fixed-point arithmetic. An unpaired last row (odd height) is converted
     * on its own. No input alignment is required. Nothing is read or written
     * outside `width` x `height` pixels of the luma/output planes.
     *
     * @param yp     luma plane
     * @param up     U plane, (width+1)/2 x (height+1)/2 samples
     * @param vp     V plane, same geometry as the U plane
     * @param sy     luma stride in bytes
     * @param suv    chroma stride in bytes (shared by U and V)
     * @param width  image width in pixels
     * @param height image height in pixels
     * @param rgb    output buffer, 3 bytes per pixel
     * @param srgb   output stride in bytes
     */
    YUV2BGR_SSSE3_FN
    void yuv420_to_rgb24_sse3(uint8_t *yp, uint8_t *up, uint8_t *vp, int sy, int suv, int width, int height,
        uint8_t *rgb, int srgb)
    {
        for (int row = 0; row < height; row += 2) {
            /* The last row of an odd-height image has no partner row. */
            const int has_row1 = (height - row) > 1;

            /* Offsets are widened before multiplying so large frames cannot overflow int. */
            const uint8_t *y0 = yp + (intptr_t)sy * row;
            const uint8_t *y1 = has_row1 ? y0 + sy : y0;
            const uint8_t *urow = up + (intptr_t)suv * (row / 2);
            const uint8_t *vrow = vp + (intptr_t)suv * (row / 2);
            uint8_t *d0 = rgb + (intptr_t)srgb * row;
            uint8_t *d1 = has_row1 ? d0 + srgb : d0;

            int x = 0;

            /* SIMD path: 16 pixels of up to two rows per iteration. */
            for (; x <= width - 16; x += 16) {
                yuv2bgr_chroma16 chroma;

                yuv2bgr_chroma_terms(urow + x / 2, vrow + x / 2, &chroma);
                yuv2bgr_row16(d0 + 3 * x, y0 + x, &chroma);
                if (has_row1) {
                    yuv2bgr_row16(d1 + 3 * x, y1 + x, &chroma);
                }
            }

            /* Scalar tail for the remaining (width % 16) columns. */
            for (; x < width; ++x) {
                const int u = urow[x / 2];
                const int v = vrow[x / 2];

                yuv2bgr_pixel(d0 + 3 * x, y0[x], u, v);
                if (has_row1) {
                    yuv2bgr_pixel(d1 + 3 * x, y1[x], u, v);
                }
            }
        }
    }
  • 相关阅读:
    eclipse中的项目如何打成war包
    【SVN】Please execute the 'Cleanup' command.
    2021.06.02模拟赛DP2
    2021.05.26模拟赛 DP1
    状压DP
    高斯消元
    矩阵快速幂
    2021.05.10讲题
    Luogu P2152[SDOI 2009]Super GCD
    Tarjan
  • 原文地址:https://www.cnblogs.com/luoyinjie/p/9935778.html
Copyright © 2011-2022 走看看