diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index bb85c12f..8146b7d4 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -282,36 +282,10 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the reciprocal square root of the input.
 	//////////////////////////////////////////////////////////////////////////
-	#if defined(RTM_COMPILER_MSVC) && _MSC_VER >= 1920 && _MSC_VER < 1925 && defined(RTM_ARCH_X64) && !defined(RTM_AVX_INTRINSICS)
-	// HACK!!! Visual Studio 2019 has a code generation bug triggered by the code below, disable optimizations for now
-	// Bug only happens with x64 SSE2, not with AVX nor with x86
-	// Fixed in 16.5.4, see https://github.com/nfrechette/rtm/issues/35
-	// TODO: Remove this hack sometime in 2022 or later once the fix is old enough that we no longer have to support the hack
-	#pragma optimize("", off)
-	#endif
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL scalar_sqrt_reciprocal(scalarf_arg0 input) RTM_NO_EXCEPT
 	{
-		// Perform two passes of Newton-Raphson iteration on the hardware estimate
-		const __m128 half = _mm_set_ss(0.5F);
-		const __m128 input_half = _mm_mul_ss(input.value, half);
-		const __m128 x0 = _mm_rsqrt_ss(input.value);
-
-		// First iteration
-		__m128 x1 = _mm_mul_ss(x0, x0);
-		x1 = _mm_sub_ss(half, _mm_mul_ss(input_half, x1));
-		x1 = _mm_add_ss(_mm_mul_ss(x0, x1), x0);
-
-		// Second iteration
-		__m128 x2 = _mm_mul_ss(x1, x1);
-		x2 = _mm_sub_ss(half, _mm_mul_ss(input_half, x2));
-		x2 = _mm_add_ss(_mm_mul_ss(x1, x2), x1);
-
-		return scalarf{ x2 };
+		return scalarf{ _mm_div_ss(_mm_set_ps1(1.0F), _mm_sqrt_ss(input.value)) };
 	}
-	#if defined(RTM_COMPILER_MSVC) && _MSC_VER >= 1920 && _MSC_VER < 1925 && defined(RTM_ARCH_X64) && !defined(RTM_AVX_INTRINSICS)
-	// HACK!!! See comment above
-	#pragma optimize("", on)
-	#endif
 #endif
 
 	//////////////////////////////////////////////////////////////////////////
@@ -319,11 +293,12 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE float RTM_SIMD_CALL scalar_sqrt_reciprocal(float input) RTM_NO_EXCEPT
 	{
-#if defined(RTM_SSE2_INTRINSICS)
-		return scalar_cast(scalar_sqrt_reciprocal(scalar_set(input)));
-#else
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense, uses fewer instructions,
+		// and is more likely to inline.
 		return 1.0F / scalar_sqrt(input);
-#endif
 	}
 
 #if defined(RTM_SSE2_INTRINSICS)
@@ -341,11 +316,12 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE float RTM_SIMD_CALL scalar_reciprocal(float input) RTM_NO_EXCEPT
 	{
-#if defined(RTM_SSE2_INTRINSICS)
-		return scalar_cast(scalar_reciprocal(scalar_set(input)));
-#else
-		return 1.0f / input;
-#endif
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense, uses fewer instructions,
+		// and is more likely to inline.
+		return 1.0F / input;
 	}
 
 #if defined(RTM_SSE2_INTRINSICS)
@@ -1291,8 +1267,9 @@ namespace rtm
 		__m128 abs_value = _mm_and_ps(value.value, _mm_castsi128_ps(abs_mask));
 
 		// Compute our value
-		__m128 is_larger_than_one = _mm_cmpgt_ss(abs_value, _mm_set_ps1(1.0F));
-		__m128 reciprocal = scalar_reciprocal(scalarf{ abs_value }).value;
+		const __m128 one = _mm_set_ps1(1.0F);
+		__m128 is_larger_than_one = _mm_cmpgt_ss(abs_value, one);
+		__m128 reciprocal = _mm_div_ss(one, abs_value);
 		__m128 x = RTM_VECTOR4F_SELECT(is_larger_than_one, reciprocal, abs_value);
 
@@ -1375,15 +1352,18 @@ namespace rtm
 		// If X < 0.0, we return atan(y/x) + sign(Y) * PI
 		// See: https://en.wikipedia.org/wiki/Atan2#Definition_and_computation
 
+		const __m128 y_value = y.value;
+		const __m128 x_value = x.value;
+
 		const __m128 zero = _mm_setzero_ps();
-		__m128 is_x_zero = _mm_cmpeq_ss(x.value, zero);
-		__m128 is_y_zero = _mm_cmpeq_ss(y.value, zero);
+		__m128 is_x_zero = _mm_cmpeq_ss(x_value, zero);
+		__m128 is_y_zero = _mm_cmpeq_ss(y_value, zero);
 		__m128 inputs_are_zero = _mm_and_ps(is_x_zero, is_y_zero);
 
-		__m128 is_x_positive = _mm_cmpgt_ss(x.value, zero);
+		__m128 is_x_positive = _mm_cmpgt_ss(x_value, zero);
 
 		const __m128 sign_mask = _mm_set_ps(-0.0F, -0.0F, -0.0F, -0.0F);
-		__m128 y_sign = _mm_and_ps(y.value, sign_mask);
+		__m128 y_sign = _mm_and_ps(y_value, sign_mask);
 
 		// If X == 0.0, our offset is PI/2 otherwise it is PI both with the sign of Y
 		__m128 half_pi = _mm_set_ps1(constants::half_pi());
@@ -1397,8 +1377,10 @@ namespace rtm
 		// If X == 0.0 and Y == 0.0, our offset is 0.0
 		offset = _mm_andnot_ps(inputs_are_zero, offset);
 
-		__m128 angle = _mm_div_ss(y.value, x.value);
-		__m128 value = scalar_atan(scalarf{ angle }).value;
+		__m128 angle = _mm_div_ss(y_value, x_value);
+		scalarf angle_s = scalarf{ angle };
+		scalarf value_s = scalar_atan(angle_s);
+		__m128 value = value_s.value;
 
 		// If X == 0.0, our value is 0.0 otherwise it is atan(y/x)
 		value = _mm_or_ps(_mm_and_ps(is_x_zero, zero), _mm_andnot_ps(is_x_zero, value));
diff --git a/includes/rtm/vector4d.h b/includes/rtm/vector4d.h
index 52e4e32d..c6f6322d 100644
--- a/includes/rtm/vector4d.h
+++ b/includes/rtm/vector4d.h
@@ -977,6 +977,11 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4d vector_reciprocal(const vector4d& input) RTM_NO_EXCEPT
 	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense, uses fewer instructions,
+		// and is more likely to inline.
 		return vector_div(vector_set(1.0), input);
 	}
 
@@ -996,6 +1001,19 @@ namespace rtm
 #endif
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// Per component reciprocal square root of the input: 1.0 / sqrt(input)
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4d vector_sqrt_reciprocal(const vector4d& input) RTM_NO_EXCEPT
+	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense, uses fewer instructions,
+		// and is more likely to inline.
+		return vector_reciprocal(vector_sqrt(input));
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Per component returns the smallest integer value not less than the input (round towards positive infinity).
 	// vector_ceil([1.8, 1.0, -1.8, -1.0]) = [2.0, 1.0, -1.0, -1.0]
diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h
index 06e68187..ac3c5cfa 100644
--- a/includes/rtm/vector4f.h
+++ b/includes/rtm/vector4f.h
@@ -1123,6 +1123,11 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_reciprocal(vector4f_arg0 input) RTM_NO_EXCEPT
 	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense, uses fewer instructions,
+		// and is more likely to inline.
 #if defined(RTM_SSE2_INTRINSICS)
 		return _mm_div_ps(_mm_set_ps1(1.0F), input);
 #elif defined(RTM_NEON64_INTRINSICS)
@@ -1160,6 +1165,19 @@ namespace rtm
 #endif
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// Per component reciprocal square root of the input: 1.0 / sqrt(input)
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_sqrt_reciprocal(vector4f_arg0 input) RTM_NO_EXCEPT
+	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense, uses fewer instructions,
+		// and is more likely to inline.
+		return vector_reciprocal(vector_sqrt(input));
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Per component returns the smallest integer value not less than the input (round towards positive infinity).
 	// vector_ceil([1.8, 1.0, -1.8, -1.0]) = [2.0, 1.0, -1.0, -1.0]
diff --git a/tests/sources/test_vector4_impl.h b/tests/sources/test_vector4_impl.h
index 6e64eebb..ed97da45 100644
--- a/tests/sources/test_vector4_impl.h
+++ b/tests/sources/test_vector4_impl.h
@@ -547,6 +547,11 @@ void test_vector4_arithmetic_impl(const FloatType threshold)
 	CHECK(scalar_near_equal(vector_get_z(vector_sqrt(vector_abs(test_value0))), scalar_sqrt(scalar_abs(test_value0_flt[2])), threshold));
 	CHECK(scalar_near_equal(vector_get_w(vector_sqrt(vector_abs(test_value0))), scalar_sqrt(scalar_abs(test_value0_flt[3])), threshold));
 
+	CHECK(scalar_near_equal(vector_get_x(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[0]))), threshold));
+	CHECK(scalar_near_equal(vector_get_y(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[1]))), threshold));
+	CHECK(scalar_near_equal(vector_get_z(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[2]))), threshold));
+	CHECK(scalar_near_equal(vector_get_w(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[3]))), threshold));
+
 	const Vector4Type neg_zero = vector_set(FloatType(-0.0));
 	CHECK(FloatType(vector_get_x(vector_floor(neg_zero))) == scalar_floor(FloatType(-0.0)));
 	CHECK(FloatType(vector_get_x(vector_floor(test_value0))) == scalar_floor(test_value0_flt[0]));
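
For context on the performance notes above, the sketch below is not part of the patch; it is a standalone illustration with plain SSE intrinsics, and the helper names rsqrt_estimate_nr and rsqrt_full are hypothetical. It contrasts the two strategies the notes compare: the hardware reciprocal square root estimate refined with Newton-Raphson iterations (the approach the patch removes) and the full square root plus division (the approach the patch adopts). Both produce closely matching results; the division form simply emits fewer instructions.

#include <immintrin.h>
#include <cstdio>

// Hardware reciprocal square root estimate refined with two Newton-Raphson
// iterations, mirroring the scalar_sqrt_reciprocal body removed above.
// Each iteration computes x(n+1) = x(n) * (1.5 - 0.5 * input * x(n)^2).
static float rsqrt_estimate_nr(float input)
{
	const __m128 value = _mm_set_ss(input);
	const __m128 half = _mm_set_ss(0.5F);
	const __m128 input_half = _mm_mul_ss(value, half);
	const __m128 x0 = _mm_rsqrt_ss(value);

	__m128 x1 = _mm_mul_ss(x0, x0);
	x1 = _mm_sub_ss(half, _mm_mul_ss(input_half, x1));
	x1 = _mm_add_ss(_mm_mul_ss(x0, x1), x0);

	__m128 x2 = _mm_mul_ss(x1, x1);
	x2 = _mm_sub_ss(half, _mm_mul_ss(input_half, x2));
	x2 = _mm_add_ss(_mm_mul_ss(x1, x2), x1);

	return _mm_cvtss_f32(x2);
}

// Full square root followed by a division, the form the patch switches to.
static float rsqrt_full(float input)
{
	const __m128 value = _mm_set_ss(input);
	return _mm_cvtss_f32(_mm_div_ss(_mm_set_ss(1.0F), _mm_sqrt_ss(value)));
}

int main()
{
	const float inputs[] = { 0.25F, 2.0F, 10000.0F };
	for (float input : inputs)
		std::printf("rsqrt(%g): estimate+NR = %.9g, sqrt+div = %.9g\n", input, rsqrt_estimate_nr(input), rsqrt_full(input));
	return 0;
}

The refinement path needs roughly a dozen dependent multiply/add/subtract instructions per value, whereas the sqrt+div path is two instructions; that shorter, denser sequence is the rationale stated in the performance notes.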