zoukankan      html  css  js  c++  java
  • SIMD函数整理:01 《PC平台新技术MMX(上册):开发编程指南》第8章 MMX编码技术

    一、来源

      来源:《PC平台新技术MMX(上册):开发编程指南》第8章 MMX编码技术

      书籍信息——
    http://e.360buy.com/30027396.html
    PC平台新技术MMX(上册):开发编程指南
    作 者: 吴乐南 编
    出 版 社: 东南大学出版社
    ISBN:9787810502528
    出版时间:1997-10-01
    页 数:149
    字 数:237000
    所属分类:
    电子书 > 计算机与互联网 > 编程语言与程序设计
    电子书 > 计算机与互联网 > 计算机工具书


    二、整理后的代码

      代码——

    #include <Windows.h>
    #include <stdlib.h>
    #include <stdio.h>
    #include <time.h>
    #include <conio.h>
    #include <assert.h>
    
    // MMX, SSE, SSE2
    #include <emmintrin.h>
    
    // Zero-extend four packed unsigned 16-bit words into two pairs of packed
    // unsigned 32-bit doublewords.
    // Book section: 8.1 Unpacking data / 8.1.1 Unpacking unsigned numbers
    //
    // result: the two low-order source words, each zero-extended to 32 bits.
    // mm1_dst_hi: (out) the two high-order source words, each zero-extended to 32 bits.
    // mm0_src: source value (four packed unsigned 16-bit words).
    inline __m64 md_unpack_mud4muw(__m64& mm1_dst_hi, const __m64 mm0_src)
    {
        // Interleaving with an all-zero register puts 0 into the upper 16 bits
        // of every resulting doubleword, which is exactly a zero extension.
        const __m64 zeroWords = _mm_setzero_si64();
        const __m64 lowPair = _mm_unpacklo_pi16(mm0_src, zeroWords);    // words 0 and 1
        mm1_dst_hi = _mm_unpackhi_pi16(mm0_src, zeroWords);             // words 2 and 3
        return lowPair;
    }
    
    // Sign-extend four packed signed 16-bit words into two pairs of packed
    // signed 32-bit doublewords.
    // Book section: 8.1 Unpacking data / 8.1.2 Unpacking signed numbers
    //
    // result: the two low-order source words, each sign-extended to 32 bits.
    // mm1_dst_hi: (out) the two high-order source words, each sign-extended to 32 bits.
    //             Write-only: its previous contents are never read.
    // mm0_src: source value (four packed signed 16-bit words).
    inline __m64 md_unpack_mid4miw(__m64& mm1_dst_hi, const __m64 mm0_src)
    {
        // Interleaving the source with itself places each source word in the
        // upper 16 bits of a doubleword; the arithmetic right shift by 16 then
        // moves it down while replicating its sign bit. The lower 16 bits of
        // each doubleword are shifted out, so their value is irrelevant -- using
        // mm0_src for both operands avoids reading the (possibly uninitialized)
        // output parameter and removes a false dependency on its old value.
        mm1_dst_hi = _mm_srai_pi32(_mm_unpackhi_pi16(mm0_src, mm0_src), 16);    // words 2 and 3
        return       _mm_srai_pi32(_mm_unpacklo_pi16(mm0_src, mm0_src), 16);    // words 0 and 1
    }
    
    // Interleaved pack with signed saturation: two pairs of packed signed
    // doublewords become four packed signed words.
    // Book section: 8.2 Packing data / 8.2.1 Interleaved pack with saturation
    // Example: {[B1,B0], [A1,A0]} is packed into {[B1',A1',B0',A0']}.
    // Note: a plain pack (_mm_packs_pi32) would instead turn {[B1,B0], [A1,A0]}
    //       into {[B1',B0',A1',A0']}.
    //
    // result: packed signed 16-bit words. Words 0 and 2 are the saturated
    //         doublewords of mm0_lo; words 1 and 3 are those of mm1_hi.
    // mm0_lo: low source value (A).
    // mm1_hi: high source value (B).
    inline __m64 md_pack_s_cross_miw4mid(__m64 mm0_lo, __m64 mm1_hi)
    {
        // Packing a register with itself yields [X1',X0',X1',X0'], so the low
        // half of each intermediate already holds both saturated words.
        const __m64 packedA = _mm_packs_pi32(mm0_lo, mm0_lo);    // [A1',A0',A1',A0']
        const __m64 packedB = _mm_packs_pi32(mm1_hi, mm1_hi);    // [B1',B0',B1',B0']

        // Interleave the low halves word by word: [B1',A1',B0',A0'].
        return _mm_unpacklo_pi16(packedA, packedB);
    }
    
    // Interleaved pack without saturation (wrap-around / truncation): two
    // pairs of packed unsigned doublewords become four packed unsigned words.
    // Book section: 8.2 Packing data / 8.2.2 Interleaved pack without saturation
    // Example: {[B1,B0], [A1,A0]} is packed into {[B1',A1',B0',A0']}.
    //
    // result: packed unsigned 16-bit words. Words 0 and 2 are the low 16 bits
    //         of the doublewords of mm0_lo; words 1 and 3 are the low 16 bits
    //         of the doublewords of mm1_hi.
    // mm0_lo: low source value (A).
    // mm1_hi: high source value (B).
    inline __m64 md_pack_w_cross_muw4mud(__m64 mm0_lo, __m64 mm1_hi)
    {
        // Mask that keeps only the low 16 bits of each doubleword.
        const __m64 lowWordMask = _mm_set_pi16(0, (short)0xFFFF, 0, (short)0xFFFF);

        const __m64 oddWords  = _mm_slli_pi32(mm1_hi, 16);            // B's low words moved to the upper halves
        const __m64 evenWords = _mm_and_si64(mm0_lo, lowWordMask);    // A's low words, upper halves cleared
        return _mm_or_si64(evenWords, oddWords);                      // merge both operands
    }
    
    // In-place transpose of a 2x2 matrix of packed doublewords.
    // Book section: 8.3 Non-interleaved unpack
    // Example: the matrix [[A1,A0] [B1,B0]] becomes [[B0,A0] [B1,A1]].
    //
    // [A1 A0]    [B0 A0]
    // [B1 B0] -> [B1 A1]
    // msb<-lsb
    //
    // mm0_row0: (in/out) row 0 of the matrix (A).
    // mm1_row1: (in/out) row 1 of the matrix (B).
    inline void md_matrix_transpose_2x2_mmd(__m64& mm0_row0, __m64& mm1_row1)
    {
        // Row 0 is overwritten first, so keep a copy for the second unpack.
        const __m64 savedRow0 = mm0_row0;

        // New row 0: high doubleword = B0, low doubleword = A0.
        mm0_row0 = _mm_unpacklo_pi32(savedRow0, mm1_row1);
        // New row 1: high doubleword = B1, low doubleword = A1.
        mm1_row1 = _mm_unpackhi_pi32(savedRow0, mm1_row1);
    }
    
    // Multiply a complex number by a complex constant
    // (packed 16-bit words in, packed 32-bit doublewords out).
    // Book section: 8.4 Complex multiply by a constant
    //
    // result: the complex product; the high 32 bits hold the real part and the
    //         low 32 bits hold the imaginary part.
    // mm0_src: multiplicand laid out as [?,?,Dr,Di] (the upper two words are ignored).
    // mm1_c: constant multiplier, pre-arranged as [Cr,-Ci,Ci,Cr].
    inline __m64 md_complex_mul_c_mid4miw(__m64 mm0_src, const __m64 mm1_c)
    {
        // Duplicate the low doubleword so both halves hold [Dr,Di].
        const __m64 dupSrc = _mm_unpacklo_pi32(mm0_src, mm0_src);    // [Dr,Di,Dr,Di]

        // Multiply-add of adjacent words gives
        // [(Dr*Cr - Di*Ci), (Dr*Ci + Di*Cr)].
        return _mm_madd_pi16(dupSrc, mm1_c);
    }
    
    // Absolute difference of packed unsigned bytes.
    // Book section: 8.5 Absolute difference / 8.5.1 Absolute difference of unsigned numbers
    //
    // result: per-byte absolute difference; pseudo code: result[i] = abs(mm0[i] - mm1[i]).
    // mm0: source operand A.
    // mm1: source operand B.
    inline __m64 md_absolute_deviation_mub(const __m64 mm0, const __m64 mm1)
    {
        // An unsigned saturating subtract clamps negative results to 0, so in
        // every byte lane at most one of the two differences is non-zero.
        const __m64 aMinusB = _mm_subs_pu8(mm0, mm1);    // A-B where A>B, else 0
        const __m64 bMinusA = _mm_subs_pu8(mm1, mm0);    // B-A where B>A, else 0
        return _mm_or_si64(aMinusB, bMinusA);            // combine the two halves
    }
    
    // Absolute difference of packed signed 16-bit words.
    // Book section: 8.5 Absolute difference / 8.5.2 Absolute difference of signed numbers
    //
    // result: per-word absolute difference; pseudo code: result[i] = abs(mm0[i] - mm1[i]).
    // mm0: source operand A.
    // mm1: source operand B.
    inline __m64 md_absolute_deviation_miw(const __m64 mm0, const __m64 mm1)
    {
        // Lanes where A > B get an all-ones mask, all other lanes get zero.
        const __m64 aGreater = _mm_cmpgt_pi16(mm0, mm1);

        // Swap mask: XOR(A,B) in lanes where A > B, 0 where A <= B.
        // XOR-ing an operand with XOR(A,B) turns A into B and B into A.
        const __m64 swapBits = _mm_and_si64(_mm_xor_si64(mm0, mm1), aGreater);

        const __m64 smaller = _mm_xor_si64(mm0, swapBits);    // min(A,B): A is swapped out where A > B
        const __m64 larger  = _mm_xor_si64(mm1, swapBits);    // max(A,B): B is swapped out where A > B

        // Absolute difference = max - min (wrap-around subtraction).
        return _mm_sub_pi16(larger, smaller);
    }
    
    // Absolute value of packed signed 16-bit words.
    // Book section: 8.6 Absolute value
    //
    // result: per-word absolute value; pseudo code: result[i] = abs(mm0[i]).
    //         The final subtraction saturates, so -32768 yields 32767.
    // mm0: source operand.
    inline __m64 md_abs_miw(const __m64 mm0)
    {
        // Spread the sign bit across each word: all zeros for non-negative
        // values, all ones (the value -1 in two's complement) for negatives.
        const __m64 signMask = _mm_srai_pi16(mm0, 15);

        // Two's-complement negation is "invert the bits, then add one".
        // XOR with the mask inverts only the negative lanes, and subtracting
        // the mask (-1) adds one to exactly those lanes.
        const __m64 flipped = _mm_xor_si64(mm0, signMask);
        return _mm_subs_pi16(flipped, signMask);
    }
    
    // Clamp packed signed 16-bit words to the range [iLow, iHigh].
    // Book section: 8.7 Clipping values / 8.7.1 Clipping signed numbers to an arbitrary signed range / [0]
    //
    // result: the clamped packed signed words; pseudo code:
    //         result[i] = (mm0[i]<iLow) ? iLow : ( (mm0[i]>iHigh) ? iHigh : mm0[i] ).
    // mm0: source operand.
    // iLow: lower bound of the range.
    // iHigh: upper bound of the range.
    inline __m64 md_clamp_miw(const __m64 mm0, short iLow, short iHigh)
    {
        // Bounds moved into the unsigned number space (signed value + 0x8000).
        const int highBiased = iHigh + 0x8000;
        const int lowBiased  = iLow + 0x8000;
        // Distance from the biased upper bound to the unsigned maximum.
        const int headroom = 0xFFFF - highBiased;

        // A wrap-around add of the minimum signed value biases the input into
        // the unsigned space, where saturating arithmetic can do the clamping.
        __m64 work = _mm_add_pi16(mm0, _mm_set1_pi16((short)0x8000));

        // Everything above the upper bound saturates at 0xFFFF.
        work = _mm_adds_pu16(work, _mm_set1_pi16( (short)headroom ));
        // Everything below the lower bound saturates at 0.
        work = _mm_subs_pu16(work, _mm_set1_pi16( (short)(headroom + lowBiased) ));

        // Undo the offsets so the values are signed words again.
        return _mm_add_pi16(work, _mm_set1_pi16( iLow ));
    }
    
    // Clamp packed unsigned 16-bit words to the range [uLow, uHigh].
    // Book section: 8.7 Clipping values / 8.7.2
    //
    // result: the clamped packed unsigned words; pseudo code:
    //         result[i] = (mm0[i]<uLow) ? uLow : ( (mm0[i]>uHigh) ? uHigh : mm0[i] ).
    // mm0: source operand.
    // uLow: lower bound of the range.
    // uHigh: upper bound of the range.
    inline __m64 md_clamp_muw(const __m64 mm0, unsigned short uLow, unsigned short uHigh)
    {
        // Distance from the upper bound to the unsigned 16-bit maximum.
        const unsigned int headroom = 0xFFFFU - uHigh;

        // Everything above the upper bound saturates at 0xFFFF.
        const __m64 capped = _mm_adds_pu16(mm0, _mm_set1_pi16( (short)headroom ));
        // Everything below the lower bound saturates at 0.
        const __m64 floored = _mm_subs_pu16(capped, _mm_set1_pi16( (short)(headroom + uLow) ));

        // Undo the offset.
        return _mm_add_pi16(floored, _mm_set1_pi16( uLow ));
    }
    
    // Return the constant 0 (all 64 bits clear).
    // Book section: 8.8 Generating constants / [0] producing a zero register in MM0
    inline __m64 md_setzero_mmq()
    {
        // The technique is "XOR a register with itself"; the operand's value is
        // irrelevant, but C++ requires it to be initialized before it is read.
        const __m64 anyValue = _mm_setzero_si64();
        return _mm_xor_si64(anyValue, anyValue);
        // The intrinsics library already offers this directly:
        // return _mm_setzero_si64();
    }
    
    // Return the constant with all 64 bits set.
    // Book section: 8.8 Generating constants / [1] setting all ones in MM1,
    // which is -1 in every packed data type.
    inline __m64 md_setfull_mmq()
    {
        // Comparing a register with itself for equality sets every lane to all
        // ones; the operand's value is irrelevant, but it must be initialized.
        const __m64 anyValue = _mm_setzero_si64();
        return _mm_cmpeq_pi8(anyValue, anyValue);
    }
    
    // Return the constant whose every packed byte equals 1.
    // Book section: 8.8 Generating constants / [2] producing the constant 1 in
    // every packed byte [or packed word] (or packed doubleword).
    inline __m64 md_set_1_mib()
    {
        const __m64 zeroBytes = _mm_setzero_si64();
        const __m64 minusOneBytes = _mm_cmpeq_pi8(zeroBytes, zeroBytes);    // every byte is 0xFF, i.e. -1
        return _mm_sub_pi8(zeroBytes, minusOneBytes);                       // 0 - (-1) = 1 in every byte
    }
    
    // Return the constant whose every packed word equals pow(2,n)-1.
    // Book section: 8.8 Generating constants / [3] producing the signed
    // constant pow(2,n)-1 in every packed word (or packed doubleword).
    //
    // n: exponent; asserted to lie in [1,16].
    inline __m64 md_set_pow2n_sub1_miw(int n)
    {
        assert((n>=1) && (n<=16));
        const __m64 zero = _mm_setzero_si64();
        const __m64 allOnes = _mm_cmpeq_pi8(zero, zero);
        // A logical right shift of all-ones by (16-n) keeps the low n bits set.
        const int shiftCount = 16-n;
        return _mm_srli_pi16(allOnes, shiftCount);
    }
    
    // Return the constant whose every packed word equals -pow(2,n).
    // Book section: 8.8 Generating constants / [4] producing the signed
    // constant -pow(2,n) in every packed word (or packed doubleword).
    //
    // n: exponent; asserted to lie in [0,15].
    inline __m64 md_set_neg_pow2n_miw(int n)
    {
        assert((n>=0) && (n<=15));
        const __m64 zero = _mm_setzero_si64();
        const __m64 allOnes = _mm_cmpeq_pi8(zero, zero);
        // A left shift of all-ones (-1) by n clears the low n bits: -pow(2,n).
        return _mm_slli_pi16(allOnes, n);
    }
    
    // Verification driver: runs every helper on fixed inputs and prints the
    // operands and results as 64-bit hexadecimal values.
    //
    // cnt: number of times each looped helper call is repeated. The caller
    //      passes a run-time value so the compiler cannot fold the loops away.
    void doTest(int cnt)
    {
        // Zero-initialized so that the printf calls below never read
        // indeterminate values, even when cnt <= 0 and the loops do not run.
        __m64 t0 = _mm_setzero_si64();
        __m64 t1 = _mm_setzero_si64();
        __m64 t2 = _mm_setzero_si64();
        int i;
    
        // Packed unsigned words unpacked into two pairs of packed unsigned doublewords
        printf("md_unpack_mud4muw:\n");
        t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);
        printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t2 = md_unpack_mud4muw(t1, t0);
        }
        printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        printf("\n");
    
        // Packed signed words unpacked into two pairs of packed signed doublewords
        printf("md_unpack_mid4miw:\n");
        t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);
        printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t2 = md_unpack_mid4miw(t1, t0);
        }
        printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        printf("\n");
    
        // Two pairs of packed signed doublewords, interleaved pack with saturation into packed signed words
        printf("md_pack_s_cross_miw4mid:\n");
        t1 = _mm_set_pi32(0x00001111, 0x000F2222);
        t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
        printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t0 = md_pack_s_cross_miw4mid(t2, t1);
        }
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
        printf("\n");
    
        // Two pairs of packed unsigned doublewords, interleaved wrap-around pack into packed unsigned words
        printf("md_pack_w_cross_muw4mud:\n");
        t1 = _mm_set_pi32(0x00001111, 0x000F2222);
        t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
        printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t0 = md_pack_w_cross_muw4mud(t2, t1);
        }
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
        printf("\n");
    
        // 2x2 matrix transpose of packed doublewords.
        // Note: the transpose is applied in place cnt times, so an even cnt
        // restores the original matrix.
        printf("md_matrix_transpose_2x2_mmd:\n");
        t1 = _mm_set_pi32(0x00001111, 0x000F2222);
        t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
        printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            md_matrix_transpose_2x2_mmd(t1, t2);
        }
        printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        printf("\n");
    
        // Complex multiply by a constant (packed words -> packed doublewords)
        printf("md_complex_mul_c_mid4miw:\n");
        t1 = _mm_set_pi16(0,0, 1, 1);    // 1+i
        t2 = _mm_set_pi16(3,-2, 2,3);    // 3+2i.    (1+i)*(3+2i) = 1+5i
        printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t0 = md_complex_mul_c_mid4miw(t1, t2);
        }
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
        printf("\n");
    
        // Absolute difference of packed unsigned bytes
        printf("md_absolute_deviation_mub:\n");
        t1 = _mm_set_pi8(1,2,3,4,5,6,7,8);
        t2 = _mm_set_pi8(8,7,6,5,4,3,2,1);
        printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t0 = md_absolute_deviation_mub(t1, t2);
        }
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
        printf("\n");
    
        // Absolute difference of packed signed words
        printf("md_absolute_deviation_miw:\n");
        t1 = _mm_set_pi16(-1, 1, 3, 5);
        t2 = _mm_set_pi16( 2, 2, 2, 2);
        printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t0 = md_absolute_deviation_miw(t1, t2);
        }
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
        printf("\n");
    
        // Absolute value of packed signed words.
        // Note: the printed label reads "md_abs_miw4miw" although the function
        // under test is md_abs_miw; the label is kept unchanged.
        printf("md_abs_miw4miw:\n");
        t0 = _mm_set_pi16(-1, 1, 3, -5);
        printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t1 = md_abs_miw(t0);
        }
        printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
        printf("\n");
    
        // Clamp packed signed words to [iLow,iHigh]
        printf("md_clamp_miw:\n");
        t0 = _mm_set_pi16(-15, 1, 254, 257);
        printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t1 = md_clamp_miw(t0, -1, 255);
        }
        printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
        printf("\n");
    
        // Clamp packed unsigned words to [uLow,uHigh]
        printf("md_clamp_muw:\n");
        t0 = _mm_set_pi16(1, 254, 257, 32769U);
        printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
        for(i=0; i<cnt; ++i)
        {
            t1 = md_clamp_muw(t0, 16, 255);
        }
        printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
        printf("\n");
    
        // Constant: 0
        printf("md_setzero_mmq:\t");
        t0 = md_setzero_mmq();
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        // Constant: all bits set
        printf("md_setfull_mmq:\t");
        t0 = md_setfull_mmq();
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        // Constant: every packed byte is 1
        printf("md_set_1_mib:\t");
        t0 = md_set_1_mib();
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        // Constant: every packed word is pow(2,n)-1
        printf("md_set_pow2n_sub1_miw:\t");
        t0 = md_set_pow2n_sub1_miw(8);
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        // Constant: every packed word is -pow(2,n)
        printf("md_set_neg_pow2n_miw:\t");
        t0 = md_set_neg_pow2n_miw(15);
        printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        // Leave the MMX state (EMMS) so that any x87 floating-point code that
        // runs after this function finds the FPU register stack usable again.
        _mm_empty();
    }
    
    int main(int argc, char* argv[])
    {
        // Derive the repeat count (1 or 2) from rand() so it is not a
        // compile-time constant and the compiler cannot optimize the loops away.
        const int repeatCount = 1 + (rand() & 1);
        doTest(repeatCount);
        return 0;
    }


    三、编译器生成的汇编代码

      VC6编译器生成的汇编代码——

    ; Listing generated by Microsoft (R) Optimizing Compiler Version 12.00.9044.0 
    
        TITLE    E:\zylKanbox\Doc\Program\ASM\x86\SIMD\my\md\md01_mmxguide_ch08\md01_mmxguide_ch08.cpp
        .386P
    include listing.inc
    if @Version gt 510
    .model FLAT
    else
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    CONST    SEGMENT DWORD USE32 PUBLIC 'CONST'
    CONST    ENDS
    _BSS    SEGMENT DWORD USE32 PUBLIC 'BSS'
    _BSS    ENDS
    $$SYMBOLS    SEGMENT BYTE USE32 'DEBSYM'
    $$SYMBOLS    ENDS
    _TLS    SEGMENT DWORD USE32 PUBLIC 'TLS'
    _TLS    ENDS
    ;    COMDAT ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_01BJG@?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@
    _DATA    SEGMENT DWORD USE32 PUBLIC 'DATA'
    _DATA    ENDS
    ;    COMDAT ??8@YAHABU_GUID@@0@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT __mm_cvtpi16_ps
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT __mm_cvtpu16_ps
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT __mm_cvtps_pi16
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_unpack_mud4muw@@YA?AT__m64@@AAT1@T1@@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_unpack_mid4miw@@YA?AT__m64@@AAT1@T1@@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_pack_s_cross_miw4mid@@YA?AT__m64@@T1@0@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_pack_w_cross_muw4mud@@YA?AT__m64@@T1@0@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_matrix_transpose_2x2_mmd@@YAXAAT__m64@@0@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_complex_mul_c_mid4miw@@YA?AT__m64@@T1@T1@@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_absolute_deviation_mub@@YA?AT__m64@@T1@0@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_absolute_deviation_miw@@YA?AT__m64@@T1@0@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_abs_miw@@YA?AT__m64@@T1@@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_clamp_miw@@YA?AT__m64@@T1@FF@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_clamp_muw@@YA?AT__m64@@T1@GG@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_setzero_mmq@@YA?AT__m64@@XZ
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_setfull_mmq@@YA?AT__m64@@XZ
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_set_1_mib@@YA?AT__m64@@XZ
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_set_pow2n_sub1_miw@@YA?AT__m64@@H@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?md_set_neg_pow2n_miw@@YA?AT__m64@@H@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT ?doTest@@YAXH@Z
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    ;    COMDAT _main
    _TEXT    SEGMENT PARA USE32 PUBLIC 'CODE'
    _TEXT    ENDS
    FLAT    GROUP _DATA, CONST, _BSS
        ASSUME    CS: FLAT, DS: FLAT, SS: FLAT
    endif
    
    INCLUDELIB LIBC
    INCLUDELIB OLDNAMES
    
    PUBLIC    ?doTest@@YAXH@Z                    ; doTest
    PUBLIC    ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@    ; `string'
    PUBLIC    ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    PUBLIC    ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
    PUBLIC    ??_C@_01BJG@?6?$AA@                ; `string'
    PUBLIC    ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@    ; `string'
    PUBLIC    ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@    ; `string'
    PUBLIC    ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
    PUBLIC    ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@    ; `string'
    PUBLIC    ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@    ; `string'
    PUBLIC    ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@ ; `string'
    PUBLIC    ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@ ; `string'
    PUBLIC    ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@ ; `string'
    PUBLIC    ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@ ; `string'
    PUBLIC    ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@        ; `string'
    PUBLIC    ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@        ; `string'
    PUBLIC    ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@        ; `string'
    PUBLIC    ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@        ; `string'
    PUBLIC    ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@        ; `string'
    PUBLIC    ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@        ; `string'
    PUBLIC    ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@    ; `string'
    PUBLIC    ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@    ; `string'
    EXTRN    _printf:NEAR
    ;    COMDAT ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@
    ; File E:\zylKanbox\Doc\Program\ASM\x86\SIMD\my\md\md01_mmxguide_ch08\md01_mmxguide_ch08.cpp
    _DATA    SEGMENT
    ??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@ DB 'md_unpack_mud4muw:', 0aH, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
    _DATA    SEGMENT
    ??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ DB '[%.8X%.8X] -> ', 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ DB '['
        DB    '%.8X%.8X],[%.8X%.8X]', 0aH, 00H        ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_01BJG@?6?$AA@
    _DATA    SEGMENT
    ??_C@_01BJG@?6?$AA@ DB 0aH, 00H                ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@ DB 'md_unpack_mid4miw:', 0aH, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@ DB 'md_pack_s_cross_miw4m'
        DB    'id:', 0aH, 00H                ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@
    _DATA    SEGMENT
    ??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ DB '['
        DB    '%.8X%.8X],[%.8X%.8X] -> ', 00H        ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@
    _DATA    SEGMENT
    ??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ DB '[%.8X%.8X]', 0aH, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@ DB 'md_pack_w_cross_muw4m'
        DB    'ud:', 0aH, 00H                ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@ DB 'md_matrix_transpo'
        DB    'se_2x2_mmd:', 0aH, 00H            ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@ DB 'md_complex_mul_c_mid'
        DB    '4miw:', 0aH, 00H                ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@ DB 'md_absolute_deviati'
        DB    'on_mub:', 0aH, 00H                ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@ DB 'md_absolute_deviati'
        DB    'on_miw:', 0aH, 00H                ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@ DB 'md_abs_miw4miw:', 0aH, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@ DB 'md_clamp_miw:', 0aH, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@
    _DATA    SEGMENT
    ??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@ DB 'md_clamp_muw:', 0aH, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@
    _DATA    SEGMENT
    ??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@ DB 'md_setzero_mmq:', 09H, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@
    _DATA    SEGMENT
    ??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@ DB 'md_setfull_mmq:', 09H, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@
    _DATA    SEGMENT
    ??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@ DB 'md_set_1_mib:', 09H, 00H ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@
    _DATA    SEGMENT
    ??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@ DB 'md_set_pow2n_sub1_miw:', 09H
        DB    00H                        ; `string'
    _DATA    ENDS
    ;    COMDAT ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@
    _DATA    SEGMENT
    ??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@ DB 'md_set_neg_pow2n_miw:', 09H
        DB    00H                        ; `string'
    ; Function compile flags: /Ogty
    _DATA    ENDS
    ;    COMDAT ?doTest@@YAXH@Z
    _TEXT    SEGMENT
    _cnt$ = 8
    _t0$ = -40
    _t1$ = -32
    _t2$ = -24
    ?doTest@@YAXH@Z PROC NEAR                ; doTest, COMDAT
    
    ; 232  : {
    
        push    ebp
        mov    ebp, esp
        and    esp, -8                    ; fffffff8H
        sub    esp, 40                    ; 00000028H
        push    esi
        push    edi
    
    ; 233  :     __m64 t0,t1,t2;
    ; 234  :     int i;
    ; 235  : 
    ; 236  :     // 紧缩无符号字 解包为 两组紧缩无符号双字
    ; 237  :     printf("md_unpack_mud4muw:\n");
    
        push    OFFSET FLAT:??_C@_0BE@EFJN@md_unpack_mud4muw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 238  :     t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);
    
        mov    DWORD PTR -24+[esp+52], -1985229329    ; 89abcdefH
        mov    DWORD PTR -24+[esp+56], 19088743    ; 01234567H
        movq    mm0, MMWORD PTR -24+[esp+52]
        movq    MMWORD PTR -8+[esp+52], mm0
        movq    MMWORD PTR _t0$[esp+52], mm0
    
    ; 239  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+52]
        mov    ecx, DWORD PTR _t0$[esp+56]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
    
    ; 240  :     for(i=0; i<cnt; ++i)
    
        mov    esi, DWORD PTR _cnt$[ebp]
        xor    edi, edi
        add    esp, 16                    ; 00000010H
        cmp    esi, edi
        jle    SHORT $L43808
    
    ; 241  :     {
    ; 242  :         t2 = md_unpack_mud4muw(t1, t0);
    
        movq    mm1, MMWORD PTR _t0$[esp+48]
        pxor    mm0, mm0
        movq    mm2, mm0
        movq    mm3, mm1
        punpckhwd mm3, mm2
        movq    MMWORD PTR _t1$[esp+48], mm3
        punpcklwd mm1, mm0
        movq    MMWORD PTR _t2$[esp+48], mm1
    $L43808:
    
    ; 243  :     }
    ; 244  :     printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    edx, DWORD PTR _t2$[esp+48]
        mov    eax, DWORD PTR _t2$[esp+52]
        mov    ecx, DWORD PTR _t1$[esp+48]
        push    edx
        mov    edx, DWORD PTR _t1$[esp+56]
        push    eax
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 245  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 246  : 
    ; 247  :     // 紧缩带符号字 解包为 两组紧缩带符号双字
    ; 248  :     printf("md_unpack_mid4miw:\n");
    
        push    OFFSET FLAT:??_C@_0BE@EMED@md_unpack_mid4miw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 249  :     t0 = _mm_set_pi32(0x01234567, 0x89ABCDEF);
    
        movq    mm0, MMWORD PTR -8+[esp+76]
        movq    MMWORD PTR _t0$[esp+76], mm0
    
    ; 250  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+76]
        mov    ecx, DWORD PTR _t0$[esp+80]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 40                    ; 00000028H
    
    ; 251  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43818
    
    ; 252  :     {
    ; 253  :         t2 = md_unpack_mid4miw(t1, t0);
    
        movq    mm0, MMWORD PTR _t0$[esp+48]
        movq    mm1, mm0
        movq    mm2, mm0
        mov    eax, esi
        punpcklwd mm2, mm1
        psrad    mm2, 16                    ; 00000010H
        movq    MMWORD PTR _t2$[esp+48], mm2
    $L43816:
        movq    mm1, mm0
        dec    eax
        movq    mm2, MMWORD PTR _t1$[esp+48]
        punpckhwd mm2, mm1
        psrad    mm2, 16                    ; 00000010H
        movq    MMWORD PTR _t1$[esp+48], mm2
        jne    SHORT $L43816
    $L43818:
    
    ; 254  :     }
    ; 255  :     printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    edx, DWORD PTR _t2$[esp+48]
        mov    eax, DWORD PTR _t2$[esp+52]
        mov    ecx, DWORD PTR _t1$[esp+48]
        push    edx
        mov    edx, DWORD PTR _t1$[esp+56]
        push    eax
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 256  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 257  : 
    ; 258  :     // 两组紧缩带符号双字 交叉饱和紧缩为 紧缩带符号字
    ; 259  :     printf("md_pack_s_cross_miw4mid:\n");
    
        push    OFFSET FLAT:??_C@_0BK@MCMC@md_pack_s_cross_miw4mid?3?6?$AA@ ; `string'
        call    _printf
    
    ; 260  :     t1 = _mm_set_pi32(0x00001111, 0x000F2222);
    
        mov    DWORD PTR -24+[esp+76], 991778        ; 000f2222H
        mov    DWORD PTR -24+[esp+80], 4369        ; 00001111H
        movq    mm0, MMWORD PTR -24+[esp+76]
    
    ; 261  :     t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
    
        mov    DWORD PTR -24+[esp+76], -8739        ; ffffddddH
        movq    MMWORD PTR _t1$[esp+76], mm0
        mov    DWORD PTR -24+[esp+80], -13108        ; ffffccccH
    
    ; 262  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    edx, DWORD PTR _t1$[esp+76]
        movq    MMWORD PTR -16+[esp+76], mm0
        movq    mm0, MMWORD PTR -24+[esp+76]
        movq    MMWORD PTR -8+[esp+76], mm0
        movq    MMWORD PTR _t2$[esp+76], mm0
        mov    eax, DWORD PTR _t2$[esp+76]
        mov    ecx, DWORD PTR _t2$[esp+80]
        push    eax
        mov    eax, DWORD PTR _t1$[esp+84]
        push    ecx
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 48                    ; 00000030H
    
    ; 263  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43824
    
    ; 264  :     {
    ; 265  :         t0 = md_pack_s_cross_miw4mid(t2, t1);
    
        movq    mm0, MMWORD PTR _t1$[esp+48]
        movq    mm1, mm0
        packssdw mm0, mm1
        movq    mm1, mm0
        movq    mm0, MMWORD PTR _t2$[esp+48]
        movq    mm2, mm0
        packssdw mm0, mm2
        punpcklwd mm0, mm1
        movq    MMWORD PTR _t0$[esp+48], mm0
    $L43824:
    
    ; 266  :     }
    ; 267  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t0$[esp+48]
        mov    edx, DWORD PTR _t0$[esp+52]
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 268  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 269  : 
    ; 270  :     // 两组紧缩无符号双字 交叉环绕紧缩为 紧缩无符号字
    ; 271  :     printf("md_pack_w_cross_muw4mud:\n");
    
        push    OFFSET FLAT:??_C@_0BK@BLJL@md_pack_w_cross_muw4mud?3?6?$AA@ ; `string'
        call    _printf
    
    ; 272  :     t1 = _mm_set_pi32(0x00001111, 0x000F2222);
    
        movq    mm0, MMWORD PTR -16+[esp+68]
        movq    MMWORD PTR _t1$[esp+68], mm0
    
    ; 273  :     t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
    
        movq    mm0, MMWORD PTR -8+[esp+68]
    
    ; 274  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    edx, DWORD PTR _t1$[esp+68]
        movq    MMWORD PTR _t2$[esp+68], mm0
        mov    eax, DWORD PTR _t2$[esp+68]
        mov    ecx, DWORD PTR _t2$[esp+72]
        push    eax
        mov    eax, DWORD PTR _t1$[esp+76]
        push    ecx
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 40                    ; 00000028H
    
    ; 275  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43832
    
    ; 276  :     {
    ; 277  :         t0 = md_pack_w_cross_muw4mud(t2, t1);
    
        movq    mm0, MMWORD PTR _t1$[esp+48]
        or    eax, -1
        pslld    mm0, 16                    ; 00000010H
        mov    WORD PTR -32+[esp+48], ax
        mov    WORD PTR -32+[esp+50], di
        mov    WORD PTR -32+[esp+52], ax
        mov    WORD PTR -32+[esp+54], di
        movq    mm1, MMWORD PTR -32+[esp+48]
        movq    mm2, MMWORD PTR _t2$[esp+48]
        pand    mm2, mm1
        por    mm2, mm0
        movq    MMWORD PTR _t0$[esp+48], mm2
    $L43832:
    
    ; 278  :     }
    ; 279  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t0$[esp+48]
        mov    edx, DWORD PTR _t0$[esp+52]
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 280  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 281  : 
    ; 282  :     // 2x2矩阵转置.紧缩双字
    ; 283  :     printf("md_matrix_transpose_2x2_mmd:\n");
    
        push    OFFSET FLAT:??_C@_0BO@GMLJ@md_matrix_transpose_2x2_mmd?3?6?$AA@ ; `string'
        call    _printf
    
    ; 284  :     t1 = _mm_set_pi32(0x00001111, 0x000F2222);
    
        movq    mm0, MMWORD PTR -16+[esp+68]
        movq    MMWORD PTR _t1$[esp+68], mm0
    
    ; 285  :     t2 = _mm_set_pi32(0xFFFFCCCC, 0xFFFFDDDD);
    
        movq    mm0, MMWORD PTR -8+[esp+68]
    
    ; 286  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    edx, DWORD PTR _t1$[esp+68]
        movq    MMWORD PTR _t2$[esp+68], mm0
        mov    eax, DWORD PTR _t2$[esp+68]
        mov    ecx, DWORD PTR _t2$[esp+72]
        push    eax
        mov    eax, DWORD PTR _t1$[esp+76]
        push    ecx
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 40                    ; 00000028H
    
    ; 287  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43841
    
    ; 278  :     }
    ; 279  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, esi
    $L43839:
    
    ; 288  :     {
    ; 289  :         md_matrix_transpose_2x2_mmd(t1, t2);
    
        movq    mm1, MMWORD PTR _t2$[esp+48]
        movq    mm0, MMWORD PTR _t1$[esp+48]
        movq    mm2, mm1
        dec    eax
        movq    mm3, mm0
        punpckldq mm3, mm2
        movq    MMWORD PTR _t1$[esp+48], mm3
        punpckhdq mm0, mm1
        movq    MMWORD PTR _t2$[esp+48], mm0
        jne    SHORT $L43839
    $L43841:
    
    ; 290  :     }
    ; 291  :     printf("[%.8X%.8X],[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t2$[esp+48]
        mov    edx, DWORD PTR _t2$[esp+52]
        mov    eax, DWORD PTR _t1$[esp+48]
        push    ecx
        mov    ecx, DWORD PTR _t1$[esp+56]
        push    edx
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0BH@CPNE@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 292  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 293  : 
    ; 294  :     // 复数与常量相乘(紧缩字->紧缩双字)
    ; 295  :     printf("md_complex_mul_c_mid4miw:\n");
    
        push    OFFSET FLAT:??_C@_0BL@FJHD@md_complex_mul_c_mid4miw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 296  :     t1 = _mm_set_pi16(0,0, 1, 1);    // 1+i
    
        mov    eax, 1
        mov    WORD PTR -24+[esp+80], di
        mov    WORD PTR -24+[esp+76], ax
        mov    WORD PTR -24+[esp+78], ax
        mov    WORD PTR -24+[esp+82], di
    
    ; 297  :     t2 = _mm_set_pi16(3,-2, 2,3);    // 3+2i.    (1+i)*(3+2i) = 1+5i
    
        mov    eax, 3
        movq    mm0, MMWORD PTR -24+[esp+76]
        mov    WORD PTR -24+[esp+76], ax
        movq    MMWORD PTR _t1$[esp+76], mm0
        mov    WORD PTR -24+[esp+78], 2
        mov    WORD PTR -24+[esp+80], -2        ; fffffffeH
        mov    WORD PTR -24+[esp+82], ax
        movq    mm0, MMWORD PTR -24+[esp+76]
    
    ; 298  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t1$[esp+76]
        movq    MMWORD PTR _t2$[esp+76], mm0
        mov    edx, DWORD PTR _t2$[esp+76]
        mov    eax, DWORD PTR _t2$[esp+80]
        push    edx
        mov    edx, DWORD PTR _t1$[esp+84]
        push    eax
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 48                    ; 00000030H
    
    ; 299  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43845
    
    ; 300  :     {
    ; 301  :         t0 = md_complex_mul_c_mid4miw(t1, t2);
    
        movq    mm1, MMWORD PTR _t2$[esp+48]
        movq    mm0, MMWORD PTR _t1$[esp+48]
        movq    mm2, mm0
        punpckldq mm0, mm2
        pmaddwd    mm0, mm1
        movq    MMWORD PTR _t0$[esp+48], mm0
    $L43845:
    
    ; 302  :     }
    ; 303  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+48]
        mov    ecx, DWORD PTR _t0$[esp+52]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 304  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 305  : 
    ; 306  :     // 无符号紧缩字节的绝对差
    ; 307  :     printf("md_absolute_deviation_mub:\n");
    
        push    OFFSET FLAT:??_C@_0BM@HKHJ@md_absolute_deviation_mub?3?6?$AA@ ; `string'
        call    _printf
    
    ; 308  :     t1 = _mm_set_pi8(1,2,3,4,5,6,7,8);
    
        mov    al, 5
        mov    cl, 3
        mov    dl, 2
        mov    BYTE PTR -24+[esp+68], 8
        mov    BYTE PTR -24+[esp+69], 7
        mov    BYTE PTR -24+[esp+70], 6
        mov    BYTE PTR -24+[esp+71], al
        mov    BYTE PTR -24+[esp+72], 4
        mov    BYTE PTR -24+[esp+73], cl
        mov    BYTE PTR -24+[esp+74], dl
        mov    BYTE PTR -24+[esp+75], 1
        movq    mm0, MMWORD PTR -24+[esp+68]
    
    ; 309  :     t2 = _mm_set_pi8(8,7,6,5,4,3,2,1);
    
        mov    BYTE PTR -24+[esp+68], 1
        movq    MMWORD PTR _t1$[esp+68], mm0
        mov    BYTE PTR -24+[esp+69], dl
        mov    BYTE PTR -24+[esp+70], cl
        mov    BYTE PTR -24+[esp+71], 4
        mov    BYTE PTR -24+[esp+72], al
        mov    BYTE PTR -24+[esp+73], 6
        mov    BYTE PTR -24+[esp+74], 7
        mov    BYTE PTR -24+[esp+75], 8
        movq    mm0, MMWORD PTR -24+[esp+68]
    
    ; 310  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t1$[esp+68]
        movq    MMWORD PTR _t2$[esp+68], mm0
        mov    edx, DWORD PTR _t2$[esp+68]
        mov    eax, DWORD PTR _t2$[esp+72]
        push    edx
        mov    edx, DWORD PTR _t1$[esp+76]
        push    eax
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 40                    ; 00000028H
    
    ; 311  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43852
    
    ; 312  :     {
    ; 313  :         t0 = md_absolute_deviation_mub(t1, t2);
    
        movq    mm0, MMWORD PTR _t1$[esp+48]
        movq    mm1, MMWORD PTR _t2$[esp+48]
        movq    mm2, mm0
        movq    mm3, mm1
        psubusb    mm3, mm2
        psubusb    mm0, mm1
        por    mm0, mm3
        movq    MMWORD PTR _t0$[esp+48], mm0
    $L43852:
    
    ; 314  :     }
    ; 315  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+48]
        mov    ecx, DWORD PTR _t0$[esp+52]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 316  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 317  : 
    ; 318  :     // 带符号紧缩字的绝对差
    ; 319  :     printf("md_absolute_deviation_miw:\n");
    
        push    OFFSET FLAT:??_C@_0BM@KLKG@md_absolute_deviation_miw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 320  :     t1 = _mm_set_pi16(-1, 1, 3, 5);
    
        mov    WORD PTR -24+[esp+68], 5
        mov    WORD PTR -24+[esp+70], 3
        mov    WORD PTR -24+[esp+72], 1
        mov    WORD PTR -24+[esp+74], -1
        movq    mm0, MMWORD PTR -24+[esp+68]
    
    ; 321  :     t2 = _mm_set_pi16( 2, 2, 2, 2);
    
        mov    eax, 2
        movq    MMWORD PTR _t1$[esp+68], mm0
        mov    WORD PTR -24+[esp+68], ax
        mov    WORD PTR -24+[esp+70], ax
        mov    WORD PTR -24+[esp+72], ax
        mov    WORD PTR -24+[esp+74], ax
    
    ; 322  :     printf("[%.8X%.8X],[%.8X%.8X] -> ", t1.m64_u32[1], t1.m64_u32[0], t2.m64_u32[1], t2.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t1$[esp+68]
        movq    mm0, MMWORD PTR -24+[esp+68]
        movq    MMWORD PTR _t2$[esp+68], mm0
        mov    edx, DWORD PTR _t2$[esp+68]
        mov    eax, DWORD PTR _t2$[esp+72]
        push    edx
        mov    edx, DWORD PTR _t1$[esp+76]
        push    eax
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0BK@OAF@?$FL?$CF?48X?$CF?48X?$FN?0?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 40                    ; 00000028H
    
    ; 323  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43859
    
    ; 324  :     {
    ; 325  :         t0 = md_absolute_deviation_miw(t1, t2);
    
        movq    mm2, MMWORD PTR _t2$[esp+48]
        movq    mm1, MMWORD PTR _t1$[esp+48]
        movq    mm0, mm2
        movq    mm3, mm1
        pcmpgtw    mm3, mm0
        movq    mm4, mm1
        pxor    mm4, mm0
        movq    mm0, mm4
        pand    mm0, mm3
        movq    mm3, mm0
        pxor    mm1, mm3
        pxor    mm2, mm0
        psubw    mm2, mm1
        movq    MMWORD PTR _t0$[esp+48], mm2
    $L43859:
    
    ; 326  :     }
    ; 327  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+48]
        mov    ecx, DWORD PTR _t0$[esp+52]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 328  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 329  : 
    ; 330  :     // 带符号紧缩字的绝对值
    ; 331  :     printf("md_abs_miw4miw:\n");
    
        push    OFFSET FLAT:??_C@_0BB@KOFH@md_abs_miw4miw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 332  :     t0 = _mm_set_pi16(-1, 1, 3, -5);
    
        mov    WORD PTR -24+[esp+68], -5        ; fffffffbH
        mov    WORD PTR -24+[esp+70], 3
        mov    WORD PTR -24+[esp+72], 1
        mov    WORD PTR -24+[esp+74], -1
        movq    mm0, MMWORD PTR -24+[esp+68]
        movq    MMWORD PTR _t0$[esp+68], mm0
    
    ; 333  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    edx, DWORD PTR _t0$[esp+68]
        mov    eax, DWORD PTR _t0$[esp+72]
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 32                    ; 00000020H
    
    ; 334  :     for(i=0; i<cnt; ++i)
    
        cmp    esi, edi
        jle    SHORT $L43865
    
    ; 335  :     {
    ; 336  :         t1 = md_abs_miw(t0);
    
        movq    mm1, MMWORD PTR _t0$[esp+48]
        movq    mm0, mm1
        psraw    mm0, 15                    ; 0000000fH
        movq    mm2, mm0
        pxor    mm1, mm0
        psubsw    mm1, mm2
        movq    MMWORD PTR _t1$[esp+48], mm1
    $L43865:
    
    ; 337  :     }
    ; 338  :     printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t1$[esp+48]
        mov    edx, DWORD PTR _t1$[esp+52]
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 339  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 340  : 
    ; 341  :     // 将带符号紧缩字限制在[iLow,iHigh]区间
    ; 342  :     printf("md_clamp_miw:\n");
    
        push    OFFSET FLAT:??_C@_0P@DEEP@md_clamp_miw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 343  :     t0 = _mm_set_pi16(-15, 1, 254, 257);
    
        mov    edi, 254                ; 000000feH
        mov    WORD PTR -24+[esp+68], 257        ; 00000101H
        mov    WORD PTR -24+[esp+70], di
        mov    WORD PTR -24+[esp+72], 1
        mov    WORD PTR -24+[esp+74], -15        ; fffffff1H
        movq    mm0, MMWORD PTR -24+[esp+68]
        movq    MMWORD PTR _t0$[esp+68], mm0
    
    ; 344  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+68]
        mov    ecx, DWORD PTR _t0$[esp+72]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 32                    ; 00000020H
    
    ; 345  :     for(i=0; i<cnt; ++i)
    
        test    esi, esi
        jle    SHORT $L43871
    
    ; 346  :     {
    ; 347  :         t1 = md_clamp_miw(t0, -1, 255);
    
        or    dx, -1
        mov    ax, -257                ; fffffeffH
        movd    mm0, dx
        mov    cx, 32512                ; 00007f00H
        movq    mm1, mm0
        mov    dx, -32768                ; ffff8000H
        punpcklwd mm1, mm0
        movq    mm0, mm1
        punpcklwd mm1, mm0
        movd    mm0, ax
        movq    mm2, mm0
        punpcklwd mm2, mm0
        movq    mm0, mm2
        punpcklwd mm2, mm0
        movd    mm0, cx
        movq    mm3, mm0
        punpcklwd mm3, mm0
        movq    mm0, mm3
        punpcklwd mm3, mm0
        movd    mm0, dx
        movq    mm4, mm0
        punpcklwd mm4, mm0
        movq    mm0, mm4
        punpcklwd mm4, mm0
        movq    mm0, MMWORD PTR _t0$[esp+48]
        paddw    mm0, mm4
        paddusw    mm0, mm3
        psubusw    mm0, mm2
        paddw    mm0, mm1
        movq    MMWORD PTR _t1$[esp+48], mm0
    $L43871:
    
    ; 348  :     }
    ; 349  :     printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
    
        mov    eax, DWORD PTR _t1$[esp+48]
        mov    ecx, DWORD PTR _t1$[esp+52]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 350  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 351  : 
    ; 352  :     // 将无符号紧缩字限制在[uLow,uHigh]区间
    ; 353  :     printf("md_clamp_muw:\n");
    
        push    OFFSET FLAT:??_C@_0P@NOLG@md_clamp_muw?3?6?$AA@ ; `string'
        call    _printf
    
    ; 354  :     t0 = _mm_set_pi16(1, 254, 257, 32769U);
    
        mov    WORD PTR -24+[esp+68], -32767        ; ffff8001H
        mov    WORD PTR -24+[esp+70], 257        ; 00000101H
        mov    WORD PTR -24+[esp+72], di
        mov    WORD PTR -24+[esp+74], 1
        movq    mm0, MMWORD PTR -24+[esp+68]
        movq    MMWORD PTR _t0$[esp+68], mm0
    
    ; 355  :     printf("[%.8X%.8X] -> ", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    edx, DWORD PTR _t0$[esp+68]
        mov    eax, DWORD PTR _t0$[esp+72]
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0P@GKNG@?$FL?$CF?48X?$CF?48X?$FN?5?9?$DO?5?$AA@ ; `string'
        call    _printf
        add    esp, 32                    ; 00000020H
    
    ; 356  :     for(i=0; i<cnt; ++i)
    
        test    esi, esi
        jle    SHORT $L43877
    
    ; 357  :     {
    ; 358  :         t1 = md_clamp_muw(t0, 16, 255);
    
        mov    cx, 16                    ; 00000010H
        mov    dx, -240                ; ffffff10H
        movd    mm0, cx
        mov    ax, -256                ; ffffff00H
        movq    mm1, mm0
        punpcklwd mm1, mm0
        movq    mm0, mm1
        punpcklwd mm1, mm0
        movd    mm0, dx
        movq    mm2, mm0
        punpcklwd mm2, mm0
        movq    mm0, mm2
        punpcklwd mm2, mm0
        movd    mm0, ax
        movq    mm3, mm0
        punpcklwd mm3, mm0
        movq    mm0, mm3
        punpcklwd mm3, mm0
        movq    mm0, MMWORD PTR _t0$[esp+48]
        paddusw    mm0, mm3
        psubusw    mm0, mm2
        paddw    mm0, mm1
        movq    MMWORD PTR _t1$[esp+48], mm0
    $L43877:
    
    ; 359  :     }
    ; 360  :     printf("[%.8X%.8X]\n", t1.m64_u32[1], t1.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t1$[esp+48]
        mov    edx, DWORD PTR _t1$[esp+52]
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 361  :     printf("\n");
    
        push    OFFSET FLAT:??_C@_01BJG@?6?$AA@        ; `string'
        call    _printf
    
    ; 362  : 
    ; 363  :     // 返回常数:0
    ; 364  :     printf("md_setzero_mmq:\t");
    
        push    OFFSET FLAT:??_C@_0BB@BLNI@md_setzero_mmq?3?7?$AA@ ; `string'
        call    _printf
    
    ; 365  :     t0 = md_setzero_mmq();
    
        pxor    mm0, mm0
        movq    MMWORD PTR -24+[esp+68], mm0
        movq    mm1, mm0
        pxor    mm0, mm1
        movq    MMWORD PTR _t0$[esp+68], mm0
    
    ; 366  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+68]
        mov    ecx, DWORD PTR _t0$[esp+72]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 367  : 
    ; 368  :     // 返回常数:全1
    ; 369  :     printf("md_setfull_mmq:\t");
    
        push    OFFSET FLAT:??_C@_0BB@ICKB@md_setfull_mmq?3?7?$AA@ ; `string'
        call    _printf
    
    ; 370  :     t0 = md_setfull_mmq();
    
        movq    mm0, MMWORD PTR -24+[esp+84]
        movq    mm1, mm0
        pcmpeqb    mm0, mm1
        movq    MMWORD PTR _t0$[esp+84], mm0
    
    ; 371  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    edx, DWORD PTR _t0$[esp+84]
        mov    eax, DWORD PTR _t0$[esp+88]
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 372  : 
    ; 373  :     // 返回常数:每个紧缩字节为1
    ; 374  :     printf("md_set_1_mib:\t");
    
        push    OFFSET FLAT:??_C@_0P@NKIN@md_set_1_mib?3?7?$AA@ ; `string'
        call    _printf
    
    ; 375  :     t0 = md_set_1_mib();
    
        movq    mm0, MMWORD PTR -24+[esp+100]
        movq    mm1, mm0
        movq    mm2, mm0
        pcmpeqb    mm2, mm1
        psubb    mm0, mm2
        movq    MMWORD PTR _t0$[esp+100], mm0
    
    ; 376  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    ecx, DWORD PTR _t0$[esp+100]
        mov    edx, DWORD PTR _t0$[esp+104]
        push    ecx
        push    edx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
        add    esp, 64                    ; 00000040H
    
    ; 377  : 
    ; 378  :     // 返回常数:每个紧缩字为pow(2,n)-1
    ; 379  :     printf("md_set_pow2n_sub1_miw:\t");
    
        push    OFFSET FLAT:??_C@_0BI@DPAE@md_set_pow2n_sub1_miw?3?7?$AA@ ; `string'
        call    _printf
    
    ; 380  :     t0 = md_set_pow2n_sub1_miw(8);
    
        movq    mm0, MMWORD PTR -24+[esp+52]
        movq    mm1, mm0
        pcmpeqb    mm0, mm1
        psrlw    mm0, 8
        movq    MMWORD PTR _t0$[esp+52], mm0
    
    ; 381  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    eax, DWORD PTR _t0$[esp+52]
        mov    ecx, DWORD PTR _t0$[esp+56]
        push    eax
        push    ecx
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
    
    ; 382  : 
    ; 383  :     // 返回常数:每个紧缩字为-pow(2,n)
    ; 384  :     printf("md_set_neg_pow2n_miw:\t");
    
        push    OFFSET FLAT:??_C@_0BH@NLNM@md_set_neg_pow2n_miw?3?7?$AA@ ; `string'
        call    _printf
    
    ; 385  :     t0 = md_set_neg_pow2n_miw(15);
    
        movq    mm0, MMWORD PTR -24+[esp+68]
        movq    mm1, mm0
        pcmpeqb    mm0, mm1
        psllw    mm0, 15                    ; 0000000fH
        movq    MMWORD PTR _t0$[esp+68], mm0
    
    ; 386  :     printf("[%.8X%.8X]\n", t0.m64_u32[1], t0.m64_u32[0]);
    
        mov    edx, DWORD PTR _t0$[esp+68]
        mov    eax, DWORD PTR _t0$[esp+72]
        push    edx
        push    eax
        push    OFFSET FLAT:??_C@_0M@GLHH@?$FL?$CF?48X?$CF?48X?$FN?6?$AA@ ; `string'
        call    _printf
        add    esp, 32                    ; 00000020H
    
    ; 387  : 
    ; 388  : }
    
        pop    edi
        pop    esi
        mov    esp, ebp
        pop    ebp
        ret    0
    ?doTest@@YAXH@Z ENDP                    ; doTest
    _TEXT    ENDS
    PUBLIC    _main
    EXTRN    _rand:NEAR
    ; Function compile flags: /Ogty
    ;    COMDAT _main
    _TEXT    SEGMENT
    _main    PROC NEAR                    ; COMDAT
    ; Program entry point: calls doTest once with a loop count of 1 or 2
    ; derived from rand(), then returns 0.
    
    ; 392  :     doTest((rand()&1) + 1);    // use a random number as the loop count so the compiler does not optimize the loop away
    
        call    _rand                   ; result of rand() comes back in eax
        and    eax, 1                   ; keep only bit 0 -> 0 or 1
        inc    eax                      ; +1 -> 1 or 2
        push    eax                     ; pass it as doTest's single argument
        call    ?doTest@@YAXH@Z                ; doTest
        add    esp, 4                   ; discard the 4-byte argument pushed above
    
    ; 393  :     return 0;
    
        xor    eax, eax                 ; return value = 0
    
    ; 394  : }
    
        ret    0
    _main    ENDP
    _TEXT    ENDS
    END
  • 相关阅读:
    Nodejs
    webpack与gulp的区别
    gulpjs
    Commonjs、AMD、CMD
    建造者模式
    工厂模式
    设计模式分类
    python的接口
    Python代码教你批量将PDF转为Word
    什么是“堆”,"栈","堆栈","队列",它们的区别?
  • 原文地址:https://www.cnblogs.com/zyl910/p/md01_mmxguide_ch08.html
Copyright © 2011-2022 走看看