分析了一下编译器(VC8)生成的汇编代码,发现问题是编译器造成的。。。
在没有用SSE优化的情况下,编译器生成的代码非常好,直逼手写代码。。。
而在用了SSE优化以后,编译器不能合理的分配和利用寄存器,并且生成了很多冗余的指令,导致代码简直一团糟,接近于弱智水平。。。
具体可参看下面的汇编代码,我在重要的行后面加了注释,方便理解。。。
没用SSE并行:
; COMDAT ?intersectEye@RayTriangle@@QAE_NPAVe_Ray@@PAVe_RayState@@@Z
_TEXT SEGMENT
_t$ = -56 ; size = 4
_dot_nd$ = -52 ; size = 4
_hit$ = -48 ; size = 12
_ray_dir$ = -36 ; size = 12
_ray_src$ = -24 ; size = 12
_bary$ = -12 ; size = 12
?intersectEye@RayTriangle@@QAE_NPAVe_Ray@@PAVe_RayState@@@Z PROC NEAR ; RayTriangle::intersectEye, COMDAT
; _this$ = ecx
; _ray$ = eax
; _state$ = edi
; 146 : {
sub esp, 56 ; 00000038H
; 147 : float dot_nd, t, t_near, t_far, near_clip, far_clip;
; 148 : Vector3f ray_dir, ray_src;
; 149 :
; 150 : prefetch( (char const *)(this + 1) );
; 151 :
; 152 : ray_dir = ray->dir;
; 153 :
; 154 : dot_nd = - ( normal.x * ray_dir.x + normal.y * ray_dir.y + normal.z * ray_dir.z );
xorps xmm3, xmm3
push ebx
mov ebx, eax
lea eax, DWORD PTR [ebx+24]
push esi
mov esi, ecx
mov ecx, DWORD PTR [eax]
mov edx, DWORD PTR [eax+4]
mov eax, DWORD PTR [eax+8]
movss xmm0, DWORD PTR [esi+8]
movss xmm1, DWORD PTR [esi+4]
prefetchnta BYTE PTR [esi+48]
mov DWORD PTR _ray_dir$[esp+64], ecx
mov DWORD PTR _ray_dir$[esp+68], edx
mulss xmm1, DWORD PTR _ray_dir$[esp+68]
mov DWORD PTR _ray_dir$[esp+72], eax
mulss xmm0, DWORD PTR _ray_dir$[esp+72]
addss xmm0, xmm1
movss xmm1, DWORD PTR [esi]
mulss xmm1, DWORD PTR _ray_dir$[esp+64]
; 155 :
; 156 : ray_src = ray->src;
mov ecx, ebx
mov edx, DWORD PTR [ecx]
mov eax, DWORD PTR [ecx+4]
mov ecx, DWORD PTR [ecx+8]
addss xmm0, xmm1
movaps xmm1, xmm3
subss xmm1, xmm0
; 157 :
; 158 : if( dot_nd <= 0.0f )
comiss xmm3, xmm1
movss DWORD PTR _dot_nd$[esp+64], xmm1
mov DWORD PTR _ray_src$[esp+64], edx
mov DWORD PTR _ray_src$[esp+68], eax
mov DWORD PTR _ray_src$[esp+72], ecx
; 159 : return false;
jae $L134657
; 160 :
; 161 : t = ray_src.x * normal.x + ray_src.y * normal.y + ray_src.z * normal.z + normal.w;
movss xmm7, DWORD PTR [esi+8]
mulss xmm7, DWORD PTR _ray_src$[esp+72]
movss xmm0, DWORD PTR [esi+4]
mulss xmm0, DWORD PTR _ray_src$[esp+68]
addss xmm7, xmm0
movss xmm0, DWORD PTR [esi]
mulss xmm0, DWORD PTR _ray_src$[esp+64]
addss xmm7, xmm0
addss xmm7, DWORD PTR [esi+12]
; 162 :
; 163 : t_near = ray->t_near;
movss xmm0, DWORD PTR [ebx+76]
; 164 :
; 165 : if( t <= t_near * dot_nd )
mulss xmm0, xmm1
comiss xmm0, xmm7
movss DWORD PTR _t$[esp+64], xmm7
; 166 : return false;
jae $L134657
; 167 :
; 168 : t_far = MIN( ray->t_far, state->t );
movss xmm0, DWORD PTR [edi+84]
comiss xmm0, DWORD PTR [ebx+80]
jbe SHORT $L169409
movss xmm0, DWORD PTR [ebx+80]
jmp SHORT $L169410
$L169409:
movss xmm0, DWORD PTR [edi+84]
$L169410:
; 169 :
; 170 : if( t >= t_far * dot_nd )
mulss xmm0, xmm1
comiss xmm7, xmm0
; 171 : return false;
jae $L134657
; 172 :
; 173 : // make sure the intersection in the bound box...
; 174 :
; 175 : Vector3f hit;
; 176 :
; 177 : hit.arr[ projX ] = ray_src.arr[ projX ] * dot_nd + ray_dir.arr[ projX ] * t;
movzx eax, BYTE PTR [esi+44]
add eax, eax
add eax, eax
movss xmm0, DWORD PTR _ray_dir$[esp+eax+64]
movss xmm2, DWORD PTR _ray_src$[esp+eax+64]
mulss xmm0, xmm7
mulss xmm2, xmm1
addss xmm0, xmm2
movss DWORD PTR _hit$[esp+eax+64], xmm0
; 178 : hit.arr[ projY ] = ray_src.arr[ projY ] * dot_nd + ray_dir.arr[ projY ] * t;
movzx eax, BYTE PTR [esi+45]
add eax, eax
add eax, eax
; 179 :
; 180 : // is the intersection in our triangular area...
; 181 :
; 182 : if( inited != TRUE )
cmp BYTE PTR [esi+47], 1
movss xmm0, DWORD PTR _ray_dir$[esp+eax+64]
movss xmm2, DWORD PTR _ray_src$[esp+eax+64]
mulss xmm0, xmm7
mulss xmm2, xmm1
addss xmm0, xmm2
movss DWORD PTR _hit$[esp+eax+64], xmm0
je SHORT $L134654
; 183 : prepare_intersect();
call ?prepare_intersect@RayTriangle@@QAEXXZ ; RayTriangle::prepare_intersect
movss xmm7, DWORD PTR _t$[esp+64]
movss xmm1, DWORD PTR _dot_nd$[esp+64]
xorps xmm3, xmm3
$L134654:
; 184 :
; 185 : Vector3f bary;
; 186 :
; 187 : bary.x = hit.arr[ projX ] * la.x + hit.arr[ projY ] * la.y + la.z * dot_nd;
movzx eax, BYTE PTR [esi+45]
movzx ecx, BYTE PTR [esi+44]
movss xmm6, DWORD PTR [esi+20]
mulss xmm6, DWORD PTR _hit$[esp+eax*4+64]
movss xmm0, DWORD PTR [esi+24]
lea eax, DWORD PTR _hit$[esp+eax*4+64]
mulss xmm0, xmm1
addss xmm6, xmm0
movss xmm0, DWORD PTR [esi+16]
mulss xmm0, DWORD PTR _hit$[esp+ecx*4+64]
lea ecx, DWORD PTR _hit$[esp+ecx*4+64]
addss xmm6, xmm0
; 188 :
; 189 : if( bary.x < 0.0f || bary.x > dot_nd )
comiss xmm3, xmm6
ja $L134657
comiss xmm6, xmm1
ja $L134657
; 191 :
; 192 : bary.y = hit.arr[ projX ] * lb.x + hit.arr[ projY ] * lb.y + lb.z * dot_nd;
movss xmm5, DWORD PTR [esi+32]
mulss xmm5, DWORD PTR [eax]
movss xmm0, DWORD PTR [esi+36]
mulss xmm0, xmm1
addss xmm5, xmm0
movss xmm0, DWORD PTR [esi+28]
mulss xmm0, DWORD PTR [ecx]
addss xmm5, xmm0
; 193 :
; 194 : if( bary.y < 0.0f || bary.y > dot_nd )
comiss xmm3, xmm5
ja $L134657
comiss xmm5, xmm1
ja $L134657
; 195 : return false;
; 196 :
; 197 : bary.z = dot_nd - bary.x - bary.y;
movaps xmm4, xmm1
subss xmm4, xmm6
subss xmm4, xmm5
; 198 :
; 199 : if( bary.z < 0.0f || bary.z > dot_nd )
comiss xmm3, xmm4
ja $L134657
comiss xmm4, xmm1
ja $L134657
; 200 : return false;
; 201 :
; 202 : float r_dot_nd = 1.0f / dot_nd;
movss xmm0, DWORD PTR __real@3f800000
; 203 :
; 204 : t *= r_dot_nd;
; 205 : bary.x *= r_dot_nd;
; 206 : bary.y *= r_dot_nd;
; 207 : bary.z *= r_dot_nd;
; 208 : hit.arr[ projX ] *= r_dot_nd;
movzx eax, BYTE PTR [esi+44]
divss xmm0, xmm1
movaps xmm2, xmm0
mulss xmm2, xmm7
movaps xmm7, xmm0
mulss xmm7, xmm6
movaps xmm6, xmm0
lea eax, DWORD PTR _hit$[esp+eax*4+64]
mulss xmm6, xmm5
movaps xmm5, xmm0
mulss xmm5, xmm4
movss xmm4, DWORD PTR [eax]
mulss xmm4, xmm0
movss DWORD PTR [eax], xmm4
; 209 : hit.arr[ projY ] *= r_dot_nd;
movzx eax, BYTE PTR [esi+45]
movss xmm4, DWORD PTR _hit$[esp+eax*4+64]
lea eax, DWORD PTR _hit$[esp+eax*4+64]
mulss xmm4, xmm0
; 210 : dot_nd = - dot_nd;
; 211 :
; 212 : near_clip = g_Rend.scene->opt.view.clip.min;
; 213 :
; 214 : float dist = ray->dad * t;
movss xmm0, DWORD PTR [ebx+68]
movss DWORD PTR [eax], xmm4
mov eax, DWORD PTR ?g_Rend@@3Ve_Renderer@@A+780
; 215 :
; 216 : far_clip = g_Rend.scene->opt.view.clip.max;
movss xmm4, DWORD PTR [eax+36]
subss xmm3, xmm1
movss xmm1, DWORD PTR [eax+32]
mulss xmm0, xmm2
; 217 :
; 218 : if( dist < near_clip || dist > far_clip )
comiss xmm1, xmm0
movss DWORD PTR _bary$[esp+64], xmm7
movss DWORD PTR _bary$[esp+68], xmm6
movss DWORD PTR _bary$[esp+72], xmm5
ja $L134657
comiss xmm0, xmm4
ja $L134657
; 219 : return false;
; 220 :
; 221 : hit.arr[ projZ ] = ray_src.arr[ projZ ] + ray_dir.arr[ projZ ] * t;
movzx eax, BYTE PTR [esi+46]
; 222 :
; 223 : state->normBary = bary;
mov ecx, DWORD PTR _bary$[esp+68]
mov ebx, DWORD PTR _bary$[esp+72]
add eax, eax
add eax, eax
movss xmm0, DWORD PTR _ray_dir$[esp+eax+64]
mulss xmm0, xmm2
addss xmm0, DWORD PTR _ray_src$[esp+eax+64]
movss DWORD PTR _hit$[esp+eax+64], xmm0
mov eax, DWORD PTR _bary$[esp+64]
lea edx, DWORD PTR [edi+200]
mov DWORD PTR [edx], eax
mov DWORD PTR [edx+4], ecx
mov DWORD PTR [edx+8], ebx
; 224 :
; 225 : //if( dpTri ) { // it's generated by displacement...
; 226 :
; 227 : /*float bx, by, bz;
; 228 : bx = bary.x;
; 229 : by = bary.y;
; 230 : bz = bary.z;
; 231 : Vector3f **barys = dpTri->barys;
; 232 : bary.x = bx * barys[0]->x + by * barys[1]->x + bz * barys[2]->x;
; 233 : bary.y = bx * barys[0]->y + by * barys[1]->y + bz * barys[2]->y;
; 234 : bary.z = bx * barys[0]->z + by * barys[1]->z + bz * barys[2]->z;
; 235 :
; 236 : state->vxNormals[0] = dpTri->v1->normal;
; 237 : state->vxNormals[1] = dpTri->v2->normal;
; 238 : state->vxNormals[2] = dpTri->v3->normal;*/
; 239 :
; 240 : //} else {
; 241 :
; 242 : state->vxNormals[0] = tri->v1->normal;
mov edx, DWORD PTR [esi+40]
add edx, 12 ; 0000000cH
push ebp
mov ebp, DWORD PTR [edx]
lea ebx, DWORD PTR [edi+260]
mov DWORD PTR [ebx], ebp
mov ebp, DWORD PTR [edx+4]
mov DWORD PTR [ebx+4], ebp
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ebx+8], edx
; 243 : state->vxNormals[1] = tri->v2->normal;
mov edx, DWORD PTR [esi+40]
add edx, 36 ; 00000024H
mov ebp, DWORD PTR [edx]
lea ebx, DWORD PTR [edi+272]
mov DWORD PTR [ebx], ebp
mov ebp, DWORD PTR [edx+4]
mov DWORD PTR [ebx+4], ebp
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ebx+8], edx
; 244 : state->vxNormals[2] = tri->v3->normal;
mov edx, DWORD PTR [esi+40]
add edx, 60 ; 0000003cH
mov ebp, DWORD PTR [edx]
lea ebx, DWORD PTR [edi+284]
mov DWORD PTR [ebx], ebp
mov ebp, DWORD PTR [edx+4]
mov DWORD PTR [ebx+4], ebp
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ebx+8], edx
; 245 : //}
; 246 :
; 247 : // get the nearest hit point...
; 248 :
; 249 : state->P = hit/*.abc*/;
mov ebx, DWORD PTR _hit$[esp+68]
lea edx, DWORD PTR [edi+128]
mov DWORD PTR [edx], ebx
mov ebx, DWORD PTR _hit$[esp+72]
mov DWORD PTR [edx+4], ebx
mov ebx, DWORD PTR _hit$[esp+76]
mov DWORD PTR [edx+8], ebx
; 250 : state->t = t;
; 251 :
; 252 : state->dotNd = dot_nd;
; 253 : state->bary = bary;
lea edx, DWORD PTR [edi+188]
mov DWORD PTR [edx], eax
mov eax, DWORD PTR _bary$[esp+76]
mov DWORD PTR [edx+4], ecx
mov DWORD PTR [edx+8], eax
movss DWORD PTR [edi+84], xmm2
movss DWORD PTR [edi+224], xmm3
; 254 :
; 255 : state->Ng = normal.abc;
mov edx, esi
mov eax, DWORD PTR [edx]
lea ecx, DWORD PTR [edi+140]
mov DWORD PTR [ecx], eax
mov eax, DWORD PTR [edx+4]
mov DWORD PTR [ecx+4], eax
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ecx+8], edx
; 256 : state->N = normal.abc;
mov ecx, esi
mov edx, DWORD PTR [ecx]
lea eax, DWORD PTR [edi+152]
mov DWORD PTR [eax], edx
mov edx, DWORD PTR [ecx+4]
mov DWORD PTR [eax+4], edx
mov ecx, DWORD PTR [ecx+8]
mov DWORD PTR [eax+8], ecx
; 257 :
; 258 : // for 3ds max...
; 259 : state->faceNum = tri->faceNum;
mov edx, DWORD PTR [esi+40]
mov eax, DWORD PTR [edx+144]
mov DWORD PTR [edi+256], eax
; 260 : state->hitPri = tri;
mov ecx, DWORD PTR [esi+40]
pop ebp
pop esi
mov DWORD PTR [edi+68], ecx
; 261 :
; 262 : return true;
mov al, 1
pop ebx
; 263 : }
add esp, 56 ; 00000038H
ret 0
$L134657:
pop esi
; 190 : return false;
xor al, al
pop ebx
; 263 : }
add esp, 56 ; 00000038H
ret 0
用了SSE并行:
; COMDAT ?intersectEye@RayTriangle@@QAE_NPAVe_Ray@@PAVe_RayState@@@Z
_TEXT SEGMENT
_t$ = -72 ; size = 4
_dot_nd$ = -68 ; size = 4
tv678 = -64 ; size = 16
tv657 = -64 ; size = 16
tv652 = -64 ; size = 16
_hit$ = -64 ; size = 12
tv685 = -48 ; size = 16
tv635 = -48 ; size = 16
tv627 = -48 ; size = 16
_bary$ = -48 ; size = 12
_tmp2$ = -48 ; size = 16
_tmp1$ = -48 ; size = 16
_vec$ = -48 ; size = 16
_ray_dir$ = -32 ; size = 16
_ray_src$ = -16 ; size = 16
?intersectEye@RayTriangle@@QAE_NPAVe_Ray@@PAVe_RayState@@@Z PROC NEAR ; RayTriangle::intersectEye, COMDAT
; _this$ = ecx
; _ray$ = eax
; _state$ = edi
; 146 : {
push ebp
mov ebp, esp
and esp, -16 ; fffffff0H
sub esp, 72 ; 00000048H
; 147 : float dot_nd, t, t_near, t_far, near_clip, far_clip;
; 148 : Vector4f ray_dir, ray_src, vec;
; 149 :
; 150 : prefetch( (char const *)(this + 1) );
; 151 :
; 152 : set_ps( ray_dir, ray->dir, 0.0f );
xorps xmm2, xmm2 // xmm2 = 0
push ebx //
mov ebx, eax // ebx = eax
movss xmm0, DWORD PTR [ebx+32] // xmm0 = ray->dir.z
; 153 : mul_ps( vec, normal, ray_dir );
; 154 : dot_nd = - ( vec.x + vec.y + vec.z );
; 155 :
; 156 : set_ps( ray_src, ray->src, 1.0f );
movss xmm4, DWORD PTR [ebx+8] // xmm4 = ray->src.z
movss DWORD PTR tv685[esp+84], xmm0 //
movss xmm0, DWORD PTR [ebx+28] // xmm0 = ray->dir.y
movss DWORD PTR tv685[esp+80], xmm0 //
movss xmm0, DWORD PTR [ebx+24] // xmm0 = ray->dir.x
push esi //
mov esi, ecx // esi = this
movaps xmm3, XMMWORD PTR [esi] // xmm3 = normal
prefetchnta BYTE PTR [esi+48] //
movss DWORD PTR tv685[esp+80], xmm0 //
movss DWORD PTR tv685[esp+92], xmm2 //
movaps xmm0, XMMWORD PTR tv685[esp+80] // xmm0 = ray->dir
movaps xmm1, xmm3 // xmm1 = xmm3
mulps xmm1, xmm0 // xmm1 *= xmm3
movaps XMMWORD PTR _vec$[esp+80], xmm1 // vec = xmm1
movss xmm1, DWORD PTR _vec$[esp+88] // xmm1 = vec.z
addss xmm1, DWORD PTR _vec$[esp+84] // xmm1 += vec.y
addss xmm1, DWORD PTR _vec$[esp+80] // xmm1 += vec.x
movss DWORD PTR tv678[esp+88], xmm4 //
movss xmm4, DWORD PTR [ebx+4] // xmm4 = ray->src.y
movaps XMMWORD PTR _ray_dir$[esp+80], xmm0 // ray_dir = xmm0
movaps xmm0, xmm2 // xmm0 = xmm2
subss xmm0, xmm1 // xmm0 -= xmm1
; 157 :
; 158 : if( dot_nd <= 0.0f )
comiss xmm2, xmm0 //
movss xmm1, DWORD PTR __real@3f800000 // xmm1 = 1
movss DWORD PTR tv678[esp+84], xmm4 //
movss xmm4, DWORD PTR [ebx] // xmm4 = ray->src.x
movss DWORD PTR tv678[esp+92], xmm1 // ray_src.w = xmm1
movss DWORD PTR tv678[esp+80], xmm4 // ray_src.x = xmm4
movaps xmm4, XMMWORD PTR tv678[esp+80] // xmm4 = ray_src
movss DWORD PTR _dot_nd$[esp+80], xmm0 // dot_nd = xmm0
movaps XMMWORD PTR _ray_src$[esp+80], xmm4 // ray_src = xmm4
; 159 : return false;
jae $L134660 //
; 160 :
; 161 : mul_ps( vec, normal, ray_src );
mulps xmm3, xmm4 // xmm3 *= xmm4
; 162 : t = vec.x + vec.y + vec.z + vec.w;
; 163 :
; 164 : t_near = ray->t_near;
movss xmm4, DWORD PTR [ebx+76] // xmm4 = ray->t_near
movaps XMMWORD PTR _vec$[esp+80], xmm3 // vec = xmm3
movss xmm3, DWORD PTR _vec$[esp+92] // xmm3 = vec.w
addss xmm3, DWORD PTR _vec$[esp+88] // xmm3 += vec.z
addss xmm3, DWORD PTR _vec$[esp+84] // xmm3 += vec.y
addss xmm3, DWORD PTR _vec$[esp+80] // xmm3 += vec.x
; 165 :
; 166 : if( t <= t_near * dot_nd )
mulss xmm4, xmm0 // xmm4 *= xmm0
comiss xmm4, xmm3 //
movss DWORD PTR _t$[esp+80], xmm3 // t = xmm3
; 167 : return false;
jae $L134660 //
; 168 :
; 169 : t_far = MIN( ray->t_far, state->t );
movss xmm4, DWORD PTR [edi+84] // xmm4 = state->t
comiss xmm4, DWORD PTR [ebx+80] //
jbe SHORT $L169400 //
movss xmm4, DWORD PTR [ebx+80] // xmm4 = ray->t_far
jmp SHORT $L169401 //
$L169400:
movss xmm4, DWORD PTR [edi+84] // xmm4 = state->t
$L169401:
; 170 :
; 171 : if( t >= t_far * dot_nd )
mulss xmm4, xmm0 // xmm4 *= xmm0
comiss xmm3, xmm4 //
; 172 : return false;
jae $L134660 //
; 173 :
; 174 : // make sure the intersection in the bound box...
; 175 :
; 176 : Vector3f hit;
; 177 : Vector4f tmp1, tmp2;
; 178 :
; 179 : set_ps( tmp1, ray_src.arr[ projX ], ray_src.arr[ projY ],
; 180 : ray_dir.arr[ projX ], ray_dir.arr[ projY ] );
movzx eax, BYTE PTR [esi+45] // eax = projY
movzx ecx, BYTE PTR [esi+44] // ecx = projX
add eax, eax // eax *= 4
add eax, eax
movss xmm4, DWORD PTR _ray_dir$[esp+eax+80] // xmm4 = ray_dir.arr[ projY ]
movss xmm6, DWORD PTR _ray_src$[esp+eax+80] // xmm6 = ray_src.arr[ projY ]
; 181 : set_ps( tmp2, dot_nd, dot_nd, t, t );
; 182 : mul_ps( vec, tmp1, tmp2 );
; 183 :
; 184 : vec.x = hit.arr[ projX ] = vec.x + vec.z;
movzx eax, BYTE PTR [esi+44] // eax = projX
movss DWORD PTR tv657[esp+92], xmm3 // tmp1.w = t
movss DWORD PTR tv657[esp+88], xmm3 // tmp1.z = t
movss DWORD PTR tv657[esp+84], xmm0 // tmp1.y = dot_nd
movss DWORD PTR tv657[esp+80], xmm0 // tmp1.x = dot_nd
movaps xmm7, XMMWORD PTR tv657[esp+80] // xmm7 = tmp1
movss DWORD PTR tv652[esp+92], xmm4 // tmp2.w = xmm4
add ecx, ecx // ecx *= 4
add ecx, ecx
; 185 : vec.y = hit.arr[ projY ] = vec.y + vec.w;
; 186 : vec.z = dot_nd;
; 187 : vec.w = 0.0f;
; 188 :
; 189 : // is the intersection in our triangular area...
; 190 :
; 191 : if( inited != TRUE )
cmp BYTE PTR [esi+47], 1 //
movss xmm4, DWORD PTR _ray_src$[esp+ecx+80] // xmm4 = ray_src.arr[ projX ]
movss xmm5, DWORD PTR _ray_dir$[esp+ecx+80] // xmm5 = ray_dir.arr[ projX ]
movzx ecx, BYTE PTR [esi+45] // ecx = projY
movss DWORD PTR tv652[esp+80], xmm4 // tmp2.x = xmm4
movss DWORD PTR tv652[esp+88], xmm5 // tmp2.z = xmm5
movss DWORD PTR tv652[esp+84], xmm6 // tmp2.y = xmm6
movaps xmm4, XMMWORD PTR tv652[esp+80] // xmm4 = tmp2
mulps xmm4, xmm7 // xmm4 *= xmm7
movaps XMMWORD PTR _vec$[esp+80], xmm4 // vec = xmm4
movss xmm4, DWORD PTR _vec$[esp+88] // xmm4 = vec.z
addss xmm4, DWORD PTR _vec$[esp+80] // xmm4 += vec.x
movss DWORD PTR _hit$[esp+eax*4+80], xmm4 // hit.arr[ projX ] = xmm4
movss DWORD PTR _vec$[esp+80], xmm4 // vec.x = xmm4
movss xmm4, DWORD PTR _vec$[esp+92] // xmm4 = vec.w
addss xmm4, DWORD PTR _vec$[esp+84] // xmm4 += vec.y
movss DWORD PTR _hit$[esp+ecx*4+80], xmm4 // hit.arr[ projY ] = xmm4
movss DWORD PTR _vec$[esp+84], xmm4 // vec.y = xmm4
movss DWORD PTR _vec$[esp+88], xmm0 // vec.z = dot_nd
movss DWORD PTR _vec$[esp+92], xmm2 // vec.w = 0
je SHORT $L134657 //
; 192 : prepare_intersect();
call ?prepare_intersect@RayTriangle@@QAEXXZ ; RayTriangle::prepare_intersect
movss xmm3, DWORD PTR _t$[esp+80] // xmm3 = t
movss xmm0, DWORD PTR _dot_nd$[esp+80] // xmm0 = dot_nd
movss xmm1, DWORD PTR __real@3f800000 // xmm1 = 1
xorps xmm2, xmm2 // xmm2 = 0
$L134657:
; 193 :
; 194 : Vector3f bary;
; 195 :
; 196 : set_ps( tmp1, la, 0.0f );
; 197 : mul_ps( tmp1, vec );
movaps xmm4, XMMWORD PTR _vec$[esp+80] // xmm4 = vec
movss xmm5, DWORD PTR [esi+24] // xmm5 = la.z
movss DWORD PTR tv635[esp+88], xmm5 // tmp1.z = xmm5
movss xmm5, DWORD PTR [esi+20] // xmm5 = la.y
movss DWORD PTR tv635[esp+84], xmm5 // tmp1.y = xmm5
movss xmm5, DWORD PTR [esi+16] // xmm5 = la.x
movss DWORD PTR tv635[esp+80], xmm5 // tmp1.x = xmm5
movss DWORD PTR tv635[esp+92], xmm2 // tmp1.w = xmm2
movaps xmm5, XMMWORD PTR tv635[esp+80] // xmm5 = tmp1
mulps xmm5, xmm4 // xmm5 *= xmm4
movaps XMMWORD PTR _tmp1$[esp+80], xmm5 // tmp1 = xmm5
; 198 :
; 199 : bary.x = tmp1.x + tmp1.y + tmp1.z;
movss xmm7, DWORD PTR _tmp1$[esp+88] // xmm7 = tmp1.z
addss xmm7, DWORD PTR _tmp1$[esp+84] // xmm7 += tmp1.y
addss xmm7, DWORD PTR _tmp1$[esp+80] // xmm7 += tmp1.x
; 200 :
; 201 : if( bary.x < 0.0f || bary.x > dot_nd )
comiss xmm2, xmm7 //
ja $L134660 //
comiss xmm7, xmm0 //
ja $L134660 //
; 202 : return false;
; 203 :
; 204 : set_ps( tmp2, lb, 0.0f );
movss xmm5, DWORD PTR [esi+36] // xmm5 = lb.z
movss DWORD PTR tv627[esp+88], xmm5 // tmp2.z = xmm5
movss xmm5, DWORD PTR [esi+32] // xmm5 = lb.y
movss DWORD PTR tv627[esp+84], xmm5 // tmp2.y = xmm5
movss xmm5, DWORD PTR [esi+28] // xmm5 = lb.x
movss DWORD PTR tv627[esp+80], xmm5 // tmp2.x = xmm5
movss DWORD PTR tv627[esp+92], xmm2 // tmp2.w = xmm2
movaps xmm5, XMMWORD PTR tv627[esp+80] // xmm5 = tmp2
; 205 : mul_ps( tmp2, vec );
mulps xmm5, xmm4 // xmm5 *= xmm4
movaps XMMWORD PTR _tmp2$[esp+80], xmm5 // tmp2 = xmm5
; 206 :
; 207 : bary.y = tmp2.x + tmp2.y + tmp2.z;
movss xmm6, DWORD PTR _tmp2$[esp+88] // xmm6 = tmp2.z
addss xmm6, DWORD PTR _tmp2$[esp+84] // xmm6 += tmp2.y
addss xmm6, DWORD PTR _tmp2$[esp+80] // xmm6 += tmp2.x
; 208 :
; 209 : if( bary.y < 0.0f || bary.y > dot_nd )
comiss xmm2, xmm6 //
ja $L134660 //
comiss xmm6, xmm0 //
ja $L134660 //
; 210 : return false;
; 211 :
; 212 : bary.z = dot_nd - bary.x - bary.y;
movaps xmm5, xmm0 // xmm5 = xmm0
subss xmm5, xmm7 // xmm5 -= xmm7
subss xmm5, xmm6 // xmm5 -= xmm6
; 213 :
; 214 : if( bary.z < 0.0f || bary.z > dot_nd )
comiss xmm2, xmm5 //
ja $L134660 //
comiss xmm5, xmm0 //
ja $L134660 //
; 215 : return false;
; 216 :
; 217 : float r_dot_nd = 1.0f / dot_nd;
; 218 :
; 219 : t *= r_dot_nd;
; 220 : bary.x *= r_dot_nd;
; 221 : bary.y *= r_dot_nd;
; 222 : bary.z *= r_dot_nd;
; 223 : hit.arr[ projX ] *= r_dot_nd;
movzx eax, BYTE PTR [esi+44]
divss xmm1, xmm0
movaps xmm4, xmm1
mulss xmm4, xmm3
movaps xmm3, xmm1
mulss xmm3, xmm7
movss DWORD PTR _bary$[esp+80], xmm3
movaps xmm3, xmm1
mulss xmm3, xmm6
movss DWORD PTR _bary$[esp+84], xmm3
lea eax, DWORD PTR _hit$[esp+eax*4+80]
movaps xmm3, xmm1
mulss xmm3, xmm5
movss DWORD PTR _bary$[esp+88], xmm3
movss xmm3, DWORD PTR [eax]
mulss xmm3, xmm1
movss DWORD PTR [eax], xmm3
; 224 : hit.arr[ projY ] *= r_dot_nd;
movzx eax, BYTE PTR [esi+45]
movss xmm3, DWORD PTR _hit$[esp+eax*4+80]
lea eax, DWORD PTR _hit$[esp+eax*4+80]
mulss xmm3, xmm1
movss DWORD PTR [eax], xmm3
; 225 : dot_nd = - dot_nd;
; 226 :
; 227 : near_clip = g_Rend.scene->opt.view.clip.min;
mov eax, DWORD PTR ?g_Rend@@3Ve_Renderer@@A+780
movss xmm1, DWORD PTR [eax+32]
; 228 :
; 229 : float dist = ray->dad * t;
; 230 :
; 231 : far_clip = g_Rend.scene->opt.view.clip.max;
movss xmm3, DWORD PTR [eax+36]
subss xmm2, xmm0
movss xmm0, DWORD PTR [ebx+68]
mulss xmm0, xmm4
; 232 :
; 233 : if( dist < near_clip || dist > far_clip )
comiss xmm1, xmm0
ja $L134660
comiss xmm0, xmm3
ja $L134660
; 234 : return false;
; 235 :
; 236 : hit.arr[ projZ ] = ray_src.arr[ projZ ] + ray_dir.arr[ projZ ] * t;
movzx eax, BYTE PTR [esi+46]
; 237 :
; 238 : state->normBary = bary;
mov ecx, DWORD PTR _bary$[esp+84]
add eax, eax
add eax, eax
movss xmm0, DWORD PTR _ray_dir$[esp+eax+80]
mulss xmm0, xmm4
addss xmm0, DWORD PTR _ray_src$[esp+eax+80]
movss DWORD PTR _hit$[esp+eax+80], xmm0
mov eax, DWORD PTR _bary$[esp+80]
lea edx, DWORD PTR [edi+200]
mov DWORD PTR [edx], eax
mov DWORD PTR [edx+4], ecx
mov ecx, DWORD PTR _bary$[esp+88]
mov DWORD PTR [edx+8], ecx
; 239 :
; 240 : //if( dpTri ) { // it's generated by displacement...
; 241 :
; 242 : /*float bx, by, bz;
; 243 : bx = bary.x;
; 244 : by = bary.y;
; 245 : bz = bary.z;
; 246 : Vector3f **barys = dpTri->barys;
; 247 : bary.x = bx * barys[0]->x + by * barys[1]->x + bz * barys[2]->x;
; 248 : bary.y = bx * barys[0]->y + by * barys[1]->y + bz * barys[2]->y;
; 249 : bary.z = bx * barys[0]->z + by * barys[1]->z + bz * barys[2]->z;
; 250 :
; 251 : state->vxNormals[0] = dpTri->v1->normal;
; 252 : state->vxNormals[1] = dpTri->v2->normal;
; 253 : state->vxNormals[2] = dpTri->v3->normal;*/
; 254 :
; 255 : //} else {
; 256 :
; 257 : state->vxNormals[0] = tri->v1->normal;
mov edx, DWORD PTR [esi+40]
add edx, 12 ; 0000000cH
mov ebx, DWORD PTR [edx]
lea ecx, DWORD PTR [edi+260]
mov DWORD PTR [ecx], ebx
mov ebx, DWORD PTR [edx+4]
mov DWORD PTR [ecx+4], ebx
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ecx+8], edx
; 258 : state->vxNormals[1] = tri->v2->normal;
mov ecx, DWORD PTR [esi+40]
add ecx, 36 ; 00000024H
mov ebx, DWORD PTR [ecx]
lea edx, DWORD PTR [edi+272]
mov DWORD PTR [edx], ebx
mov ebx, DWORD PTR [ecx+4]
mov DWORD PTR [edx+4], ebx
mov ecx, DWORD PTR [ecx+8]
mov DWORD PTR [edx+8], ecx
; 259 : state->vxNormals[2] = tri->v3->normal;
mov edx, DWORD PTR [esi+40]
add edx, 60 ; 0000003cH
mov ebx, DWORD PTR [edx]
lea ecx, DWORD PTR [edi+284]
mov DWORD PTR [ecx], ebx
mov ebx, DWORD PTR [edx+4]
mov DWORD PTR [ecx+4], ebx
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ecx+8], edx
; 260 : //}
; 261 :
; 262 : // get the nearest hit point...
; 263 :
; 264 : state->P = hit/*.abc*/;
mov edx, DWORD PTR _hit$[esp+80]
lea ecx, DWORD PTR [edi+128]
mov DWORD PTR [ecx], edx
mov edx, DWORD PTR _hit$[esp+84]
mov DWORD PTR [ecx+4], edx
mov edx, DWORD PTR _hit$[esp+88]
mov DWORD PTR [ecx+8], edx
; 265 : state->t = t;
; 266 :
; 267 : state->dotNd = dot_nd;
; 268 : state->bary = bary;
mov edx, DWORD PTR _bary$[esp+84]
movss DWORD PTR [edi+84], xmm4
movss DWORD PTR [edi+224], xmm2
lea ecx, DWORD PTR [edi+188]
mov DWORD PTR [ecx], eax
mov eax, DWORD PTR _bary$[esp+88]
mov DWORD PTR [ecx+4], edx
mov DWORD PTR [ecx+8], eax
; 269 :
; 270 : state->Ng = normal.abc;
mov edx, esi
mov eax, DWORD PTR [edx]
lea ecx, DWORD PTR [edi+140]
mov DWORD PTR [ecx], eax
mov eax, DWORD PTR [edx+4]
mov DWORD PTR [ecx+4], eax
mov edx, DWORD PTR [edx+8]
mov DWORD PTR [ecx+8], edx
; 271 : state->N = normal.abc;
mov ecx, esi
mov edx, DWORD PTR [ecx]
lea eax, DWORD PTR [edi+152]
mov DWORD PTR [eax], edx
mov edx, DWORD PTR [ecx+4]
mov DWORD PTR [eax+4], edx
mov ecx, DWORD PTR [ecx+8]
mov DWORD PTR [eax+8], ecx
; 272 :
; 273 : // for 3ds max...
; 274 : state->faceNum = tri->faceNum;
mov edx, DWORD PTR [esi+40]
mov eax, DWORD PTR [edx+144]
mov DWORD PTR [edi+256], eax
; 275 : state->hitPri = tri;
mov ecx, DWORD PTR [esi+40]
mov DWORD PTR [edi+68], ecx
; 276 :
; 277 : return true;
mov al, 1
; 278 : }
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
$L134660:
pop esi
xor al, al
pop ebx
mov esp, ebp
pop ebp
ret 0
看来要在光线追踪这样的程序中最大的发挥SSE的性能,还是要用手工汇编。。。
就像mayax所做的那样。。。
否则还是不要用SSE比较明智。。。
当然啦,如果改变算法,我们就可以考虑intel的Interactive Ray-tracing中的方法。。。就是并行的追踪四条光线。。。。我自己写过这样的算法,确实可以快一半左右。。。。而且我还是用SSE intrinsic写的。。。如果用汇编写,速度不敢想象。。。当然编写的痛苦程度也不敢想象。。。