/*
 * The kernel uses _mm_shuffle_epi8, which is an SSSE3 instruction (despite the
 * "sse3" in the public function name).  On GCC/Clang the functions below are
 * tagged so they build even when the translation unit is not compiled with
 * -mssse3; other compilers get an empty macro.
 */
#ifndef YUV420SSE_TARGET
#  if defined(__GNUC__) || defined(__clang__)
#    define YUV420SSE_TARGET __attribute__((target("ssse3")))
#  else
#    define YUV420SSE_TARGET
#  endif
#endif

/*
 * Chroma contributions for one block of 16 pixels, as eight signed 16-bit
 * lanes per register ("lo" = pixels 0..7, "hi" = pixels 8..15).  They are
 * shared by the two luma rows that sit on the same chroma row.
 */
struct yuv420sse_chroma16 {
    __m128i r_lo, r_hi; /* V contribution to R */
    __m128i g_lo, g_hi; /* U + V contribution to G */
    __m128i b_lo, b_hi; /* U contribution to B */
};

/*
 * Load 8 U and 8 V samples, duplicate each horizontally so it covers two
 * pixels, and compute the per-channel chroma terms.
 *
 * Fixed point: samples are biased by -128, shifted left by 3, then multiplied
 * with _mm_mulhi_epi16 (keeps the high 16 bits), so every coefficient is
 * effectively scaled by 1/8192.  The values correspond to the common
 * BT.601 limited-range factors 1.596, -0.392, -0.813 and 2.017.
 */
YUV420SSE_TARGET
static inline void yuv420sse_chroma_terms(const uint8_t *u, const uint8_t *v,
                                          struct yuv420sse_chroma16 *c)
{
    const __m128i zero    = _mm_setzero_si128();
    const __m128i uv_bias = _mm_set1_epi16(128);
    const __m128i k_rv    = _mm_set1_epi16(13075);  /* 0x3313 */
    const __m128i k_gu    = _mm_set1_epi16(-3209);  /* 0xF377 as signed 16-bit */
    const __m128i k_gv    = _mm_set1_epi16(-6660);  /* 0xE5FC as signed 16-bit */
    const __m128i k_bu    = _mm_set1_epi16(16525);  /* 0x408D */

    /* 8 bytes -> 8 zero-extended 16-bit lanes. */
    const __m128i u16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)u), zero);
    const __m128i v16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)v), zero);

    /* Interleaving a register with itself repeats every sample twice. */
    const __m128i u_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(u16, u16), uv_bias), 3);
    const __m128i u_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(u16, u16), uv_bias), 3);
    const __m128i v_lo = _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi16(v16, v16), uv_bias), 3);
    const __m128i v_hi = _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi16(v16, v16), uv_bias), 3);

    c->r_lo = _mm_mulhi_epi16(k_rv, v_lo);
    c->r_hi = _mm_mulhi_epi16(k_rv, v_hi);
    c->g_lo = _mm_add_epi16(_mm_mulhi_epi16(k_gu, u_lo), _mm_mulhi_epi16(k_gv, v_lo));
    c->g_hi = _mm_add_epi16(_mm_mulhi_epi16(k_gu, u_hi), _mm_mulhi_epi16(k_gv, v_hi));
    c->b_lo = _mm_mulhi_epi16(k_bu, u_lo);
    c->b_hi = _mm_mulhi_epi16(k_bu, u_hi);
}

/*
 * Convert 16 luma samples plus the precomputed chroma terms into 16 packed
 * pixels and write exactly 48 bytes to dst (byte order per pixel: B, G, R).
 */
YUV420SSE_TARGET
static inline void yuv420sse_emit_row16(__m128i luma,
                                        const struct yuv420sse_chroma16 *c,
                                        uint8_t *dst)
{
    const __m128i zero   = _mm_setzero_si128();
    const __m128i y_bias = _mm_set1_epi16(16);
    const __m128i y_gain = _mm_set1_epi16(9539);    /* 0x2543, ~1.164 * 8192 */
    /* Drops the 4th (padding) byte of each 32-bit pixel; 0x80 lanes become 0. */
    const __m128i pack_mask = _mm_set_epi8(-128, -128, -128, -128,
                                           14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0);

    /* ((Y - 16) << 3) * gain >> 16, on eight 16-bit lanes at a time. */
    const __m128i y_lo = _mm_mulhi_epi16(
        _mm_slli_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(luma, zero), y_bias), 3), y_gain);
    const __m128i y_hi = _mm_mulhi_epi16(
        _mm_slli_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(luma, zero), y_bias), 3), y_gain);

    /* Add chroma and saturate each channel to 0..255 (16 bytes per channel). */
    const __m128i r = _mm_packus_epi16(_mm_add_epi16(y_lo, c->r_lo), _mm_add_epi16(y_hi, c->r_hi));
    const __m128i g = _mm_packus_epi16(_mm_add_epi16(y_lo, c->g_lo), _mm_add_epi16(y_hi, c->g_hi));
    const __m128i b = _mm_packus_epi16(_mm_add_epi16(y_lo, c->b_lo), _mm_add_epi16(y_hi, c->b_hi));

    /* Planar -> interleaved: (b,g) byte pairs, then (bg, r0) 16-bit pairs. */
    const __m128i bg_lo = _mm_unpacklo_epi8(b, g);
    const __m128i bg_hi = _mm_unpackhi_epi8(b, g);
    const __m128i r0_lo = _mm_unpacklo_epi8(r, zero);
    const __m128i r0_hi = _mm_unpackhi_epi8(r, zero);

    const __m128i px0 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_lo, r0_lo), pack_mask); /* pixels 0..3   */
    const __m128i px1 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_lo, r0_lo), pack_mask); /* pixels 4..7   */
    const __m128i px2 = _mm_shuffle_epi8(_mm_unpacklo_epi16(bg_hi, r0_hi), pack_mask); /* pixels 8..11  */
    const __m128i px3 = _mm_shuffle_epi8(_mm_unpackhi_epi16(bg_hi, r0_hi), pack_mask); /* pixels 12..15 */

    /*
     * Each register holds 12 payload bytes followed by 4 zero bytes.  The
     * first three 16-byte stores spill 4 bytes that the following store
     * overwrites; the last chunk copies only 12 bytes so nothing is written
     * beyond dst + 48.
     */
    _mm_storeu_si128((__m128i *)(dst + 0), px0);
    _mm_storeu_si128((__m128i *)(dst + 12), px1);
    _mm_storeu_si128((__m128i *)(dst + 24), px2);
    memcpy(dst + 36, &px3, 12);
}

/*
 * Convert one 16-pixel-wide, two-row block.  Reads 16 bytes from y0 (and y1),
 * 8 bytes from u and v, and writes 48 bytes to dst0 (and dst1).  Pass
 * dst1 == NULL to skip the second row; y1 is not read in that case.
 */
YUV420SSE_TARGET
static inline void yuv420sse_block16x2(const uint8_t *y0, const uint8_t *y1,
                                       const uint8_t *u, const uint8_t *v,
                                       uint8_t *dst0, uint8_t *dst1)
{
    struct yuv420sse_chroma16 chroma;

    yuv420sse_chroma_terms(u, v, &chroma);
    /* Unaligned loads: callers are not required to 16-byte align the Y plane. */
    yuv420sse_emit_row16(_mm_loadu_si128((const __m128i *)y0), &chroma, dst0);
    if (dst1 != NULL) {
        yuv420sse_emit_row16(_mm_loadu_si128((const __m128i *)y1), &chroma, dst1);
    }
}

/*
 * Convert a planar YUV 4:2:0 image to packed 24-bit pixels.
 *
 * yp, up, vp  Y, U and V planes (read only).
 * sy          Y stride in bytes.
 * suv         stride in bytes of both the U and the V plane.
 * width       image width in pixels; any positive value is accepted.
 * height      image height in pixels; any positive value is accepted.
 * rgb         destination, 3 bytes per pixel stored as B, G, R in memory.
 * srgb        destination stride in bytes.
 *
 * Each chroma sample is shared by a 2x2 pixel block.  For odd widths or
 * heights the code reads chroma sample/row index (n / 2) for the last
 * pixel/row, i.e. it assumes the chroma planes are (width + 1) / 2 by
 * (height + 1) / 2 -- NOTE(review): confirm against the callers' layout.
 *
 * No alignment is required for any pointer or stride.  Exactly width * 3
 * bytes are written per destination row.  The call is a no-op when a pointer
 * is NULL or a dimension is not positive.  Requires an SSSE3-capable CPU.
 */
YUV420SSE_TARGET
void yuv420_to_rgb24_sse3(uint8_t *yp, uint8_t *up, uint8_t *vp, int sy, int suv,
                          int width, int height, uint8_t *rgb, int srgb)
{
    if (yp == NULL || up == NULL || vp == NULL || rgb == NULL ||
        width <= 0 || height <= 0) {
        return;
    }

    const int simd_width  = width & ~15;        /* pixels covered by full blocks */
    const int tail        = width - simd_width; /* 0..15 leftover pixels per row */
    const int tail_chroma = (tail + 1) / 2;     /* chroma samples backing the tail */

    for (int row = 0; row < height; row += 2) {
        const int have_row1 = (row + 1 < height);

        const uint8_t *y0 = yp + sy * row;
        const uint8_t *y1 = have_row1 ? y0 + sy : y0;
        const uint8_t *u  = up + suv * (row / 2);
        const uint8_t *v  = vp + suv * (row / 2);
        uint8_t *dst0 = rgb + srgb * row;
        uint8_t *dst1 = have_row1 ? dst0 + srgb : NULL;

        for (int x = 0; x < simd_width; x += 16) {
            yuv420sse_block16x2(y0 + x, y1 + x, u + x / 2, v + x / 2,
                                dst0 + 3 * x,
                                have_row1 ? dst1 + 3 * x : NULL);
        }

        if (tail > 0) {
            /*
             * Partial block: stage the remaining samples in zero-padded
             * scratch buffers so the kernel never touches memory outside the
             * image, then copy back only the pixels that exist.
             */
            uint8_t ty0[16] = {0};
            uint8_t ty1[16] = {0};
            uint8_t tu[8] = {0};
            uint8_t tv[8] = {0};
            uint8_t out0[48];
            uint8_t out1[48];

            memcpy(ty0, y0 + simd_width, (size_t)tail);
            memcpy(ty1, y1 + simd_width, (size_t)tail);
            memcpy(tu, u + simd_width / 2, (size_t)tail_chroma);
            memcpy(tv, v + simd_width / 2, (size_t)tail_chroma);

            yuv420sse_block16x2(ty0, ty1, tu, tv, out0, out1);

            memcpy(dst0 + 3 * simd_width, out0, (size_t)tail * 3);
            if (have_row1) {
                memcpy(dst1 + 3 * simd_width, out1, (size_t)tail * 3);
            }
        }
    }
}