zoukankan      html  css  js  c++  java
  • 原创:各种normalize函数实现的性能和精度大比拼

    /////////////////////////////////////////////////////////////////////////
    //
    // Performance benchmarking program for various normalize functions
    //
    // by Elvic Liang
    //
    /////////////////////////////////////////////////
    
    #include <math.h>
    #include <xmmintrin.h>
    #include <time.h>
    
    struct Vector
    {
    	float x, y, z;
    
    	inline Vector() {}
    
    	inline Vector(float _x, float _y, float _z) : x(_x), y(_y), z(_z) {}
    
    	inline Vector operator * (float rhs) const
    	{
    		Vector temp;
    		temp.x = x * rhs;
    		temp.y = y * rhs;
    		temp.z = z * rhs;
    		return temp;
    	}
    };
    
    template <typename T>
    inline T max(T a, T b)
    {
    	return ((a > b) ? a : b);
    }
    
    inline float rcpf(float x)
    {
    #ifdef _MSC_VER
    	return 1.0f / x;
    #else
    	const __m128 a = _mm_set_ss(x);
    	const __m128 r = _mm_rcp_ss(a);
    	// one more iteration
    	return _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(r, r), _mm_mul_ss(_mm_mul_ss(r, r), a)));
    #endif
    }
    
    inline float invsqrtf(float x)
    {
    	const __m128 a = _mm_max_ss(_mm_set_ss(x), _mm_set_ss(1.0e-30f));
    	const __m128 r = _mm_rsqrt_ss(a);
    	// one more iteration
    	return _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), 
    		_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
    }
    
    inline float fastinvsqrt(float x)
    {
    	float xhalf = 0.5f * x;
    	int i = *(int *)&x;
    	i = 0x5f3759df - (i >> 1);
    	x = *(float *)&i;
    	x = x * (1.5f - xhalf * x * x);
    	return x;
    }
    
    inline float fastsqrt(float x)
    {
    	union {
    		int intPart;
    		float floatPart;
    	} convertor;
    	union {
    		int intPart;
    		float floatPart;
    	} convertor2;
    	convertor.floatPart = x;
    	convertor2.floatPart = x;
    	convertor.intPart = 0x1fbcf800 + (convertor.intPart >> 1);
    	convertor2.intPart = 0x5f3759df - (convertor2.intPart >> 1);
    	return 0.5f * (convertor.floatPart + (x * convertor2.floatPart));
    }
    
    inline float dot(const Vector & a, const Vector & b)
    {
    	return (a.x * b.x + a.y * b.y + a.z * b.z);
    }
    
    inline float len(const Vector & a)
    {
    	const float l = dot(a, a);
    	return sqrtf(max(0.0f, l));
    }
    
    inline Vector normalize_ref(const Vector & a)
    {
    	float length = sqrtf(max(0.0f, a.x * a.x + a.y * a.y + a.z * a.z));
    	// Using division gives higher precision than multiplying (1/length)
    	return Vector(a.x / length, a.y / length, a.z / length);
    }
    
    inline Vector normalize(const Vector & a)
    {
    	return a * invsqrtf(dot(a, a));
    }
    
    inline Vector normalize_v1(const Vector & a)
    {
    	const __m128 pa = _mm_max_ss(_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z), _mm_set_ss(1.0e-30f));
    	const __m128 r = _mm_rsqrt_ss(pa);
    	// one more iteration
    	const float d = _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), 
    		_mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
    	return a * d;
    }
    
    inline Vector normalize_v2(const Vector & a)
    {
    	return a * fastinvsqrt(dot(a, a));
    }
    
    inline Vector normalize_v3(const Vector & a)
    {
    	// TODO: Use SSE 4.2 dot product intrinsic when available
    	const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
    	const __m128 s = _mm_mul_ps(x, x);
        const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    	const __m128 pa = _mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f));
    	const __m128 r = _mm_rsqrt_ss(pa);
    	// one more iteration
    	return a * _mm_cvtss_f32(_mm_mul_ss(r, _mm_add_ss(_mm_set_ss(1.5f), 
    		_mm_mul_ss(_mm_mul_ss(pa, _mm_set_ss(-0.5f)), _mm_mul_ss(r, r)))));
    }
    
    inline float normalize_len(Vector & r, const Vector & a)
    {
    	const float l = len(a);
    	const float d = max(l, 1.0e-30f);
    	r = a * rcpf(d);
    	return d;
    }
    
    inline float normalize_len_v1(Vector & r, const Vector & a)
    {
    	const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    	r = a * rcpf(d);
    	return d;
    }
    
    inline float normalize_len_v2(Vector & r, const Vector & a)
    {
    	const float d = sqrtf(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    	const __m128 pa = _mm_set_ss(d);
    	const __m128 pr = _mm_rcp_ss(pa);
    	// one more iteration
    	const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    	r = a * rd;
    	return d;
    }
    
    inline float normalize_len_v3(Vector & r, const Vector & a)
    {
    	const __m128 pa = _mm_sqrt_ss(_mm_max_ss(_mm_set_ss(1.0e-30f), 
    		_mm_set_ss(a.x * a.x + a.y * a.y + a.z * a.z)));
    	const __m128 pr = _mm_rcp_ss(pa);
    	// one more iteration
    	const float rd = _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    	r = a * rd;
    	return _mm_cvtss_f32(pa);
    }
    
    inline float normalize_len_v4(Vector & r, const Vector & a)
    {
    	const float d = fastsqrt(max(1.0e-30f, a.x * a.x + a.y * a.y + a.z * a.z));
    	r = a * rcpf(d);
    	return d;
    }
    
    inline float normalize_len_v5(Vector & r, const Vector & a)
    {
    	// TODO: Use SSE 4.2 dot product intrinsic when available
    	const __m128 x = _mm_set_ps(1.0f, a.z, a.y, a.x);
    	const __m128 s = _mm_mul_ps(x, x);
        const __m128 t = _mm_add_ss(s, _mm_movehl_ps(s, s));
    	const __m128 pa = _mm_sqrt_ss(
    		_mm_max_ss(_mm_add_ss(t, _mm_shuffle_ps(t, t, 1)), _mm_set_ss(1.0e-30f)));
    	const __m128 pr = _mm_rcp_ss(pa);
    	// one more iteration
    	r = a * _mm_cvtss_f32(_mm_sub_ss(_mm_add_ss(pr, pr), _mm_mul_ss(_mm_mul_ss(pr, pr), pa)));
    	return _mm_cvtss_f32(pa);
    }
    
    struct Random
    {
    	unsigned int state;
    
    	inline Random(unsigned int seed = 0x9e3779b1)
    	{
    		state = hash(seed);
    	}
    
    	inline unsigned int hash(unsigned int a)
    	{
    		a = (a+0x7ed55d16) + (a<<12);
    		a = (a^0xc761c23c) ^ (a>>19);
    		a = (a+0x165667b1) + (a<<5);
    		a = (a+0xd3a2646c) ^ (a<<9);
    		a = (a+0xfd7046c5) + (a<<3);
    		a = (a^0xb55a4f09) ^ (a>>16);
    		return a;
    	}
    
    	inline float next_float()
    	{
    		state = hash(state);
    		return (state & 0xFFFFFF) * (1.0f / float(1 << 24));
    	}
    
    	inline float next()
    	{
    		return (next_float() * 1000.0f - 500.0f);
    	}
    };
    
    int get_time()
    {
    	return (int)clock();
    }
    
    int main(int argc, char* argv[])
    {
    	const int NTEST = 100000000;
    
    	int rand_time = 0;
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			sum += (v.x + v.y + v.z);
    		}
    		rand_time = get_time() - start_time;
    		printf("random:          sum = %f    time = %d
    ", sum, rand_time);
    	}
    
    	printf("testing performance...
    ");
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r = normalize_ref(v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_ref (reference):       sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r = normalize(v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize:       sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r = normalize_v1(v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_v1:       sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r = normalize_v2(v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_v2 (fast):       sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r = normalize_v3(v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_v3:       sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			normalize_len(r, v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_len:   sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			normalize_len_v1(r, v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_len_v1:   sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			normalize_len_v2(r, v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_len_v2:   sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			normalize_len_v3(r, v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_len_v3:   sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			normalize_len_v4(r, v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_len_v4 (fast):   sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	{
    		int start_time = get_time();
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			normalize_len_v5(r, v);
    			sum += (r.x + r.y + r.z);
    		}
    		int done_time = get_time() - start_time - rand_time;
    		printf("normalize_len_v5:   sum = %f    time = %d
    ", sum, done_time);
    	}
    
    	printf("testing precision...
    ");
    
    	{
    		float max_error = 0.0f;
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r1, r2;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r1 = normalize_ref(v);
    			r2 = normalize_v1(v);
    			sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
    			max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
    		}
    		printf("normalize_v1:   sum = %f    max. error = %.17f
    ", sum, max_error);
    	}
    
    	{
    		float max_error = 0.0f;
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r1, r2;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r1 = normalize_ref(v);
    			r2 = normalize_v2(v);
    			sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
    			max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
    		}
    		printf("normalize_v2 (fast):   sum = %f    max. error = %.17f
    ", sum, max_error);
    	}
    
    	{
    		float max_error = 0.0f;
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r1, r2;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r1 = normalize_ref(v);
    			r2 = normalize_v3(v);
    			sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
    			max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
    		}
    		printf("normalize_v3:   sum = %f    max. error = %.17f
    ", sum, max_error);
    	}
    
    	{
    		float max_error = 0.0f;
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r1, r2;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r1 = normalize_ref(v);
    			normalize_len_v3(r2, v);
    			sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
    			max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
    		}
    		printf("normalize_len_v3:   sum = %f    max. error = %.17f
    ", sum, max_error);
    	}
    
    	{
    		float max_error = 0.0f;
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r1, r2;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r1 = normalize_ref(v);
    			normalize_len_v4(r2, v);
    			sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
    			max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
    		}
    		printf("normalize_len_v4 (fast):   sum = %f    max. error = %.17f
    ", sum, max_error);
    	}
    
    	{
    		float max_error = 0.0f;
    		Random random;
    		double sum = 0.0;
    		for (int i = 0; i < NTEST; ++i)
    		{
    			Vector v, r1, r2;
    			v.x = random.next();
    			v.y = random.next();
    			v.z = random.next();
    			r1 = normalize_ref(v);
    			normalize_len_v5(r2, v);
    			sum += (r1.x + r1.y + r1.z + r2.x + r2.y + r2.z);
    			max_error = max(fabsf(r1.x - r2.x), max(fabsf(r1.y - r2.y), fabsf(r1.z - r2.z)));
    		}
    		printf("normalize_len_v5:   sum = %f    max. error = %.17f
    ", sum, max_error);
    	}
    
    	return 0;
    }
    

    输出数据:

    random: sum = 296444.360077 time = 2667
    testing performance...
    normalize_ref (reference): sum = 3368.279413 time = 2528
    normalize: sum = 3368.280312 time = 1451
    normalize_v1: sum = 3368.280312 time = 1530
    normalize_v2 (fast): sum = 3360.893364 time = 671
    normalize_v3: sum = 3368.279959 time = 1405
    normalize_len: sum = 3368.279241 time = 2465
    normalize_len_v1: sum = 3368.279241 time = 2200
    normalize_len_v2: sum = 3368.280580 time = 2481
    normalize_len_v3: sum = 3368.280580 time = 1717
    normalize_len_v4 (fast): sum = 3399.301634 time = 2044
    normalize_len_v5: sum = 3368.280429 time = 1514
    testing precision...
    normalize_v1: sum = 6736.559725 max. error = 0.00000005960464478
    normalize_v2 (fast): sum = 6729.172777 max. error = 0.00128239393234253
    normalize_v3: sum = 6736.559372 max. error = 0.00000011920928955
    normalize_len_v3: sum = 6736.559993 max. error = 0.00000005960464478
    normalize_len_v4 (fast): sum = 6767.581047 max. error = 0.01472616195678711

    normalize_len_v5: sum = 6736.559842 max. error = 0.00000005960464478

    最后结论:

    normalize_v3 性价比最高。如果你在3D游戏、或者其它交互性要求很高的场合使用normalize,可以考虑使用这个快速实现,只需要平台支持SSE 2.0即可。如果你对精度要求不高,可以考虑使用normalize_v2 (fast)版本的实现,它使用了Quake 3游戏引擎中的快速开平方根算法。

  • 相关阅读:
    实例 find
    实例 历史命令查找
    Crontab
    find命令
    实例 tar备份以日期命名
    断开网络驱动器后图标不消失
    Windows7系统下优化固态硬盘
    目标进程已退出,但未引发 CoreCLR 启动事件
    md5 helper
    List<T> or IList<T>
  • 原文地址:https://www.cnblogs.com/len3d/p/4641683.html
Copyright © 2011-2022 走看看