更多详情见——
http://www.cnblogs.com/zyl910/archive/2012/04/26/md00.html
SIMD函数整理:00 索引贴
R:寄存器。M:64位MM寄存器;X:128位XMM寄存器;Y:256位YMM寄存器。
Name:函数名。
Name2:另一种函数名。
功能:功能描述。
Asm:汇编指令。
PCode:伪代码。
R | Name | Name2 | 功能 | Asm | PCode |
X | _MM_SHUFFLE | 混洗的掩码.4 | # | http://msdn.microsoft.com/en-us/library/4d3eabky(vs.71).aspx | |
X | _MM_TRANSPOSE4_PS | 矩阵转置.4x4 | # | http://msdn.microsoft.com/en-us/library/5hah127h(v=vs.71).aspx | |
X | _MM_SET_EXCEPTION_STATE | 状态.设置异常状态 | # | http://msdn.microsoft.com/en-us/library/s61ysx0a(v=vs.71).aspx | |
X | _MM_GET_EXCEPTION_STATE | 状态.取得异常状态 | # | http://msdn.microsoft.com/en-us/library/7kzfa3h8(v=vs.71).aspx | |
X | _MM_SET_EXCEPTION_MASK | 状态.设置异常掩码 | # | http://msdn.microsoft.com/en-us/library/7ad8d8fy(v=vs.71).aspx | |
X | _MM_GET_EXCEPTION_MASK | 状态.取得异常掩码 | # | http://msdn.microsoft.com/en-us/library/f13f3eaz(v=vs.71).aspx | |
X | _MM_SET_ROUNDING_MODE | 状态.设置舍入模式 | # | http://msdn.microsoft.com/en-us/library/y70z2105(v=vs.71).aspx | |
X | _MM_GET_ROUNDING_MODE | 状态.取得舍入模式 | # | http://msdn.microsoft.com/en-us/library/wc7hx623(v=vs.71).aspx | |
X | _MM_SET_FLUSH_ZERO_MODE | 状态.设置下溢清零模式 | # | http://msdn.microsoft.com/en-us/library/a8b5ts9s(v=vs.71).aspx | |
X | _MM_GET_FLUSH_ZERO_MODE | 状态.取得下溢清零模式 | # | http://msdn.microsoft.com/en-us/library/5207b86e(v=vs.71).aspx | |
X | _mm_add_ss | 加法.单精.标量 | ADDSS | r.fS[0]=m1.fS[0]+m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_add_ps | 加法.单精.紧缩 | ADDPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]+m2.fS[i]; } | |
X | _mm_sub_ss | 减法.单精.标量 | SUBSS | r.fS[0]=m1.fS[0]-m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_sub_ps | 减法.单精.紧缩 | SUBPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]-m2.fS[i]; } | |
X | _mm_mul_ss | 乘法.单精.标量 | MULSS | r.fS[0]=m1.fS[0]*m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_mul_ps | 乘法.单精.紧缩 | MULPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]*m2.fS[i]; } | |
X | _mm_div_ss | 除法.单精.标量 | DIVSS | r.fS[0]=m1.fS[0]/m2.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_div_ps | 除法.单精.紧缩 | DIVPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]/m2.fS[i]; } | |
X | _mm_sqrt_ss | 平方根.单精.标量 | SQRTSS | r.fS[0]=sqrt(m1.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_sqrt_ps | 平方根.单精.紧缩 | SQRTPS | for(i=0;i<4;++i){ r.fS[i]=sqrt(m1.fS[i]); } | |
X | _mm_rcp_ss | 倒数.单精.标量 | RCPSS | r.fS[0]=1/m1.fS[0]; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_rcp_ps | 倒数.单精.紧缩 | RCPPS | for(i=0;i<4;++i){ r.fS[i]=1/m1.fS[i]; } | |
X | _mm_rsqrt_ss | 平方根的倒数.单精.标量 | RSQRTSS | r.fS[0]=1/sqrt(m1.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_rsqrt_ps | 平方根的倒数.单精.紧缩 | RSQRTPS | for(i=0;i<4;++i){ r.fS[i]=1/sqrt(m1.fS[i]); } | |
X | _mm_min_ss | 最小值.单精.标量 | MINSS | r.fS[0]=min(m1.fS[0], m2.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_min_ps | 最小值.单精.紧缩 | MINPS | for(i=0;i<4;++i){ r.fS[i]=min(m1.fS[i], m2.fS[i]); } | |
X | _mm_max_ss | 最大值.单精.标量 | MAXSS | r.fS[0]=max(m1.fS[0], m2.fS[0]); for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_max_ps | 最大值.单精.紧缩 | MAXPS | for(i=0;i<4;++i){ r.fS[i]=max(m1.fS[i], m2.fS[i]); } | |
X | _mm_and_ps | 逻辑位与.单精 | ANDPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]&m2.fS[i]; } | |
X | _mm_andnot_ps | 逻辑位与非.单精 | ANDNPS | for(i=0;i<4;++i){ r.fS[i]=(~m1.fS[i])&m2.fS[i]; } | |
X | _mm_or_ps | 逻辑位或.单精 | ORPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]|m2.fS[i]; } | |
X | _mm_xor_ps | 逻辑位异或.单精 | XORPS | for(i=0;i<4;++i){ r.fS[i]=m1.fS[i]^m2.fS[i]; } | |
X | _mm_cmpeq_ss | 比较.等于.单精.标量 | CMPEQSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpeq_ps | 比较.等于.单精.紧缩 | CMPEQPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmplt_ss | 比较.小于.单精.标量 | CMPLTSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmplt_ps | 比较.小于.单精.紧缩 | CMPLTPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmple_ss | 比较.小于等于.单精.标量 | CMPLESS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmple_ps | 比较.小于等于.单精.紧缩 | CMPLEPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpgt_ss | 比较.大于.单精.标量 | CMPLTSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpgt_ps | 比较.大于.单精.紧缩 | CMPLTPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpge_ss | 比较.大于等于.单精.标量 | CMPLESS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpge_ps | 比较.大于等于.单精.紧缩 | CMPLEPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpneq_ss | 比较.不等于.单精.标量 | CMPNEQSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpneq_ps | 比较.不等于.单精.紧缩 | CMPNEQPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpnlt_ss | 比较.不小于.单精.标量 | CMPNLTSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpnlt_ps | 比较.不小于.单精.紧缩 | CMPNLTPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpnle_ss | 比较.不小于等于.单精.标量 | CMPNLESS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpnle_ps | 比较.不小于等于.单精.紧缩 | CMPNLEPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpngt_ss | 比较.不大于.单精.标量 | CMPNLTSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpngt_ps | 比较.不大于.单精.紧缩 | CMPNLTPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpnge_ss | 比较.不大于等于.单精.标量 | CMPNLESS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpnge_ps | 比较.不大于等于.单精.紧缩 | CMPNLEPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpord_ss | 比较.有序.单精.标量 | CMPORDSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpord_ps | 比较.有序.单精.紧缩 | CMPORDPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_cmpunord_ss | 比较.无序.单精.标量 | CMPUNORDSS | r = BM(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cmpunord_ps | 比较.无序.单精.紧缩 | CMPUNORDPS | for(i=0;i<4;++i){ r.fS[i]=BM(m1.fS[i] @ m2.fS[i]); } | |
X | _mm_comieq_ss | 有序比较并设标志.相等.单精 | COMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_comilt_ss | 有序比较并设标志.小于.单精 | COMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_comile_ss | 有序比较并设标志.小于等于.单精 | COMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_comigt_ss | 有序比较并设标志.大于.单精 | COMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_comige_ss | 有序比较并设标志.大于等于.单精 | COMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_comineq_ss | 有序比较并设标志.不等于.单精 | COMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_ucomieq_ss | 无序比较并设标志.相等.单精 | UCOMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_ucomilt_ss | 无序比较并设标志.小于.单精 | UCOMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_ucomile_ss | 无序比较并设标志.小于等于.单精 | UCOMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_ucomigt_ss | 无序比较并设标志.大于.单精 | UCOMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_ucomige_ss | 无序比较并设标志.大于等于.单精 | UCOMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_ucomineq_ss | 无序比较并设标志.不等于.单精 | UCOMISS | r = EFLAGS(m1.fS[0] @ m2.fS[0]) | |
X | _mm_cvt_ss2si | _mm_cvtss_si32 | 转换.单精度至符32位.标量 | CVTSS2SI | r=(int32)m1.fS[0] |
X | _mm_cvt_ps2pi | _mm_cvtps_pi32 | 转换.单精度至符32位.低位2个 | CVTPS2PI | for(i=0;i<2;++i){ r.iD[i]=(int32)m1.fS[i]; } |
X | _mm_cvtt_ss2si | _mm_cvttss_si32 | 截尾法转换.单精度至符32位.标量 | CVTTSS2SI | r=(int32)TRUNC(m1.fS[0]) |
X | _mm_cvtt_ps2pi | _mm_cvttps_pi32 | 截尾法转换.单精度至符32位.低位2个 | CVTTPS2PI | for(i=0;i<2;++i){ r.iD[i]=(int32)TRUNC(m1.fS[i]); } |
X | _mm_cvt_si2ss | _mm_cvtsi32_ss | 转换.符32位至单精度.标量 | CVTSI2SS | r.fS[0]=(float)m2; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } |
X | _mm_cvt_pi2ps | _mm_cvtpi32_ps | 转换.符32位至单精度.低位2个 | CVTPI2PS | for(i=0;i<2;++i){ r.fS[i]=(float)m2.iD[i]; } for(i=2;i<4;++i){ r.fS[i]=m1.fS[i]; } |
X | _mm_cvtss_f32 | 转换.提取低32位的单精度浮点数 | r=m1.fS[0] | ||
X | _mm_cvtss_si64 | 转换.单精度至符64位.标量 | CVTSS2SI | r=(int64)m1.fS[0] | |
X | _mm_cvttss_si64 | 截尾法转换.单精度至符64位.标量 | CVTTSS2SI | r=(int64)TRUNC(m1.fS[0]) | |
X | _mm_cvtsi64_ss | 转换.符64位至单精度.标量 | CVTSI2SS | r.fS[0]=(float)m2; for(i=1;i<4;++i){ r.fS[i]=m1.fS[i]; } | |
X | _mm_shuffle_ps | 混洗.单精.2源 | SHUFPS | for(i=0;i<2;++i){ r.fS[i]=m1.fS[(_Imm8>>(i*2)) & 3]; } for(i=2;i<4;++i){ r.fS[i]=m2.fS[(_Imm8>>(i*2)) & 3]; } | |
X | _mm_unpackhi_ps | 高位解包.单精 | UNPCKHPS | for(i=0;i<2;++i){ r.fS[i*2]=m1.fS[2+i]; r.fS[i*2+1]=m2.fS[2+i]; } | |
X | _mm_unpacklo_ps | 低位解包.单精 | UNPCKLPS | for(i=0;i<2;++i){ r.fS[i*2]=m1.fS[i]; r.fS[i*2+1]=m2.fS[i]; } | |
X | _mm_loadh_pi | 高位传送.加载64位 | MOVHPS reg, mem | m1.mQ[1]=*m2; | |
X | _mm_movehl_ps | 高到低传送.高位2组 | MOVHLPS | r=m1; for(i=0;i<2;++i){ r.fS[i]=m2.fS[2+i]; } | |
X | _mm_movelh_ps | 低到高传送.低位2组 | MOVLHPS | r=m1; for(i=0;i<2;++i){ r.fS[2+i]=m2.fS[i]; } | |
X | _mm_storeh_pi | 高位传送.存储64位 | MOVHPS mem, reg | *A=m2.mQ[1]; | |
X | _mm_loadl_pi | 低位传送.加载64位 | MOVLPS reg, mem | m1.mQ[0]=*m2; | |
X | _mm_storel_pi | 低位传送.存储64位 | MOVLPS mem, reg | *A=m2.mQ[0]; | |
X | _mm_movemask_ps | 传送符号位生成掩码.单精 | MOVMSKPS | r=0; for(i=0;i<4;++i){ r|=SBIT(m1.fS[i])<<i; } | |
M | _m_pextrw | _mm_extract_pi16 | 传送.提取.16位 | PEXTRW | r = ZX(m1.uW[imm8]) |
M | _m_pinsrw | _mm_insert_pi16 | 传送.插入.16位 | PINSRW | m1.uW[imm8]=(WORD)m2 |
M | _m_pmaxsw | _mm_max_pi16 | 最大.带16位.紧缩 | PMAXSW | for(i=0;i<4;++i){ r.iW[i]=MAX(m1.iW[i],m2.iW[i]); } |
M | _m_pmaxub | _mm_max_pu8 | 最大.无8位.紧缩 | PMAXUB | for(i=0;i<8;++i){ r.uB[i]=MAX(m1.uB[i],m2.uB[i]); } |
M | _m_pminsw | _mm_min_pi16 | 最小.带16位.紧缩 | PMINSW | for(i=0;i<4;++i){ r.iW[i]=MIN(m1.iW[i],m2.iW[i]); } |
M | _m_pminub | _mm_min_pu8 | 最小.无8位.紧缩 | PMINUB | for(i=0;i<8;++i){ r.uB[i]=MIN(m1.uB[i],m2.uB[i]); } |
M | _m_pmovmskb | _mm_movemask_pi8 | 传送符号位生成掩码.字节 | PMOVMSKB | r=0; for(i=0;i<8;++i){ r|=SBIT(m1.iB[i])<<i; } |
M | _m_pmulhuw | _mm_mulhi_pu16 | 乘法高位.无16位 | PMULHUW | for(i=0;i<4;++i){ r.uW[i]=hi16(m1.uW[i]*m2.uW[i]); } |
M | _m_pshufw | _mm_shuffle_pi16 | 混洗.字.1源 | PSHUFW | for(i=0;i<4;++i){ r.uW[i]=m1.uW[(imm8>>(i*2)) & 3]; } |
M | _m_maskmovq | _mm_maskmove_si64 | 选择性传送.8字节 | MASKMOVQ | for(i=0;i<8;++i){ if(SBIT(m2.iB[i])) P[i]=m1.iB[i]; } |
M | _m_pavgb | _mm_avg_pu8 | 平均值.无8位 | PAVGB | for(i=0;i<8;++i){ r.uB[i]=AVG(m1.uB[i],m2.uB[i]); } |
M | _m_pavgw | _mm_avg_pu16 | 平均值.无16位 | PAVGW | for(i=0;i<4;++i){ r.uW[i]=AVG(m1.uW[i],m2.uW[i]); } |
M | _m_psadbw | _mm_sad_pu8 | 绝对差.无8位,再水平8求和 | PSADBW | r=0; for(i=0;i<8;++i){ r.uW[0]+=ABS((WORD)m1.uB[i] - m2.uB[i]); } |
X | _mm_set_ss | 赋值.单精.标量 | r.fS[0]=arg[0]; for(i=1;i<4;++i){ r.fS[i]=0; } | ||
X | _mm_set_ps1 | _mm_set1_ps | 重复赋值.单精.紧缩 | for(i=0;i<4;++i){ r.fS[i]=arg[0]; } | |
X | _mm_set_ps | 赋值.单精.紧缩 | for(i=0;i<4;++i){ r.fS[i]=arg[3-i]; } | ||
X | _mm_setr_ps | 逆序赋值.单精.紧缩 | for(i=0;i<4;++i){ r.fS[i]=arg[i]; } | ||
X | _mm_setzero_ps | 赋值为零.单精.紧缩 | r=0 | ||
X | _mm_load_ss | 加载.单精.标量 | MOVSS | r.fS[0]=_A[0]; for(i=1;i<4;++i){ r.fS[i]=0; } | |
X | _mm_load_ps1 | _mm_load1_ps | 重复加载.单精.紧缩 | MOVSS + Shuffling | for(i=0;i<4;++i){ r.fS[i]=_A[0]; } |
X | _mm_load_ps | 加载.单精.紧缩.对齐 | MOVAPS | for(i=0;i<4;++i){ r.fS[i]=_A[i]; } | |
X | _mm_loadr_ps | 逆序加载.单精.紧缩.对齐 | MOVAPS + Shuffling | for(i=0;i<4;++i){ r.fS[i]=_A[3-i]; } | |
X | _mm_loadu_ps | 加载.单精.紧缩.非对齐 | MOVUPS | for(i=0;i<4;++i){ r.fS[i]=_A[i]; } | |
X | _mm_store_ss | 存储.单精.标量 | MOVSS | _A[0]=m1.fS[0] | |
X | _mm_store_ps1 | _mm_store1_ps | 重复存储.单精.紧缩 | MOVSS + Shuffling | for(i=0;i<4;++i){ _A[i]=m1.fS[0]; } |
X | _mm_store_ps | 存储.单精.紧缩.对齐 | MOVAPS | for(i=0;i<4;++i){ _A[i]=m1.fS[i]; } | |
X | _mm_storer_ps | 逆序存储.单精.紧缩.对齐 | MOVAPS + Shuffling | for(i=0;i<4;++i){ _A[i]=m1.fS[3-i]; } | |
X | _mm_storeu_ps | 存储.单精.紧缩.非对齐 | MOVUPS | for(i=0;i<4;++i){ _A[i]=m1.fS[i]; } | |
X | _mm_move_ss | 标量传送.单精 | MOVSS | m1.fS[0]=m2.fS[0] | |
_mm_prefetch | 缓存.预取 | PREFETCH | http://msdn.microsoft.com/en-us/library/84szxsww(v=vs.110).aspx | ||
M | _mm_stream_pi | 非时间性存储.mm | MOVNTQ | *_A=m1 | |
X | _mm_stream_ps | 非时间性存储.单精度 | MOVNTPS | *_A=m1 | |
_mm_sfence | 存储隔离 | SFENCE | http://msdn.microsoft.com/en-us/library/5h2w73d1(v=vs.110).aspx | ||
_mm_getcsr | 获取MXCSR | STMXCSR | r=MXCSR | ||
_mm_setcsr | 设置MXCSR | LDMXCSR | MXCSR=m1 | ||
_mm_malloc | mm分配内存(IGL) | #IGL | |||
_mm_free | mm释放内存(IGL) | #IGL | |||
X | _mm_cvtpi16_ps | 转换.符16位至单精度.紧缩4个 | _inline | for(i=0;i<4;++i){ r.fS[i]=(float)m1.iW[i]; } | |
X | _mm_cvtpu16_ps | 转换.无16位至单精度.紧缩4个 | _inline | for(i=0;i<4;++i){ r.fS[i]=(float)m1.uW[i]; } | |
X | _mm_cvtps_pi16 | 转换.单精度至符16位.紧缩4个 | _inline | _mm_packs_pi32(_mm_cvtps_pi32(a), _mm_cvtps_pi32(_mm_movehl_ps(a, a))); | |
X | _mm_cvtpi8_ps | 转换.符8位至单精度.低位4个 | _inline | _mm_cvtpi16_ps(_mm_unpacklo_pi8(a, _mm_cmpgt_pi8(_mm_setzero_si64(), a))); | |
X | _mm_cvtpu8_ps | 转换.无8位至单精度.低位4个 | _inline | _mm_cvtpu16_ps(_mm_unpacklo_pi8(a, _mm_setzero_si64())); | |
X | _mm_cvtps_pi8 | 转换.单精度至符8位.低位4个 | _inline | _mm_packs_pi16(_mm_cvtps_pi16(a), _mm_setzero_si64()); | |
X | _mm_cvtpi32x2_ps | 转换.符32位至单精度.2源 | _inline | _mm_movelh_ps(_mm_cvt_pi2ps(_mm_setzero_ps(), a), _mm_cvt_pi2ps(_mm_setzero_ps(), b)); |