From 7159d1bdc6b488ca312717be8a3c14eab908be9f Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 15:29:30 -0500 Subject: [PATCH 1/8] style(scalar): use capital F for floating point constant --- includes/rtm/scalarf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index bb85c12f..f04a1693 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -344,7 +344,7 @@ namespace rtm #if defined(RTM_SSE2_INTRINSICS) return scalar_cast(scalar_reciprocal(scalar_set(input))); #else - return 1.0f / input; + return 1.0F / input; #endif } From c09180fe82aa852d69800435ca258d77091245ad Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 15:35:42 -0500 Subject: [PATCH 2/8] docs: add performance note --- includes/rtm/scalarf.h | 5 +++++ includes/rtm/vector4d.h | 5 +++++ includes/rtm/vector4f.h | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index f04a1693..a5570820 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -344,6 +344,11 @@ namespace rtm #if defined(RTM_SSE2_INTRINSICS) return scalar_cast(scalar_reciprocal(scalar_set(input))); #else + // Performance note: + // With modern out-of-order executing processors, it is typically faster to use + // a full division instead of a reciprocal estimate + Newton-Raphson iterations + // because the resulting code is more dense and is more likely to inline and + // as it uses fewer instructions. return 1.0F / input; #endif } diff --git a/includes/rtm/vector4d.h b/includes/rtm/vector4d.h index 52e4e32d..b8983c46 100644 --- a/includes/rtm/vector4d.h +++ b/includes/rtm/vector4d.h @@ -977,6 +977,11 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4d vector_reciprocal(const vector4d& input) RTM_NO_EXCEPT { + // Performance note: + // With modern out-of-order executing processors, it is typically faster to use + // a full division instead of a reciprocal estimate + Newton-Raphson iterations + // because the resulting code is more dense and is more likely to inline and + // as it uses fewer instructions. return vector_div(vector_set(1.0), input); } diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h index 06e68187..2f5b5f37 100644 --- a/includes/rtm/vector4f.h +++ b/includes/rtm/vector4f.h @@ -1123,6 +1123,11 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_reciprocal(vector4f_arg0 input) RTM_NO_EXCEPT { + // Performance note: + // With modern out-of-order executing processors, it is typically faster to use + // a full division instead of a reciprocal estimate + Newton-Raphson iterations + // because the resulting code is more dense and is more likely to inline and + // as it uses fewer instructions. #if defined(RTM_SSE2_INTRINSICS) return _mm_div_ps(_mm_set_ps1(1.0F), input); #elif defined(RTM_NEON64_INTRINSICS) From 15070b78eb28699e276fa3ca5dea7259aaa1c5ff Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 15:36:16 -0500 Subject: [PATCH 3/8] fix(scalar): use C++ scalar code to avoid intrinsic roundtrip overhead --- includes/rtm/scalarf.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index a5570820..5c921916 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -341,16 +341,12 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE float RTM_SIMD_CALL scalar_reciprocal(float input) RTM_NO_EXCEPT { -#if defined(RTM_SSE2_INTRINSICS) - return scalar_cast(scalar_reciprocal(scalar_set(input))); -#else // Performance note: // With modern out-of-order executing processors, it is typically faster to use // a full division instead of a reciprocal estimate + Newton-Raphson iterations // because the resulting code is more dense and is more likely to inline and // as it uses fewer instructions. return 1.0F / input; -#endif } #if defined(RTM_SSE2_INTRINSICS) From 4a2e61498d12bf1ca395a8f69486f54cc95650e9 Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 15:38:37 -0500 Subject: [PATCH 4/8] fix(scalar): use div/sqrt for scalar reciprocal square root --- includes/rtm/scalarf.h | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index 5c921916..9a46278b 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -282,36 +282,10 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// // Returns the reciprocal square root of the input. ////////////////////////////////////////////////////////////////////////// - #if defined(RTM_COMPILER_MSVC) && _MSC_VER >= 1920 && _MSC_VER < 1925 && defined(RTM_ARCH_X64) && !defined(RTM_AVX_INTRINSICS) - // HACK!!! Visual Studio 2019 has a code generation bug triggered by the code below, disable optimizations for now - // Bug only happens with x64 SSE2, not with AVX nor with x86 - // Fixed in 16.5.4, see https://github.com/nfrechette/rtm/issues/35 - // TODO: Remove this hack sometime in 2022 or later once the fix is old enough that we no longer have to support the hack - #pragma optimize("", off) - #endif RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL scalar_sqrt_reciprocal(scalarf_arg0 input) RTM_NO_EXCEPT { - // Perform two passes of Newton-Raphson iteration on the hardware estimate - const __m128 half = _mm_set_ss(0.5F); - const __m128 input_half = _mm_mul_ss(input.value, half); - const __m128 x0 = _mm_rsqrt_ss(input.value); - - // First iteration - __m128 x1 = _mm_mul_ss(x0, x0); - x1 = _mm_sub_ss(half, _mm_mul_ss(input_half, x1)); - x1 = _mm_add_ss(_mm_mul_ss(x0, x1), x0); - - // Second iteration - __m128 x2 = _mm_mul_ss(x1, x1); - x2 = _mm_sub_ss(half, _mm_mul_ss(input_half, x2)); - x2 = _mm_add_ss(_mm_mul_ss(x1, x2), x1); - - return scalarf{ x2 }; + return scalarf{ _mm_div_ss(_mm_set_ps1(1.0F), _mm_sqrt_ss(input.value)) }; } - #if defined(RTM_COMPILER_MSVC) && _MSC_VER >= 1920 && _MSC_VER < 1925 && defined(RTM_ARCH_X64) && !defined(RTM_AVX_INTRINSICS) - // HACK!!! See comment above - #pragma optimize("", on) - #endif #endif ////////////////////////////////////////////////////////////////////////// @@ -319,11 +293,12 @@ namespace rtm ////////////////////////////////////////////////////////////////////////// RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE float RTM_SIMD_CALL scalar_sqrt_reciprocal(float input) RTM_NO_EXCEPT { -#if defined(RTM_SSE2_INTRINSICS) - return scalar_cast(scalar_sqrt_reciprocal(scalar_set(input))); -#else + // Performance note: + // With modern out-of-order executing processors, it is typically faster to use + // a full division instead of a reciprocal estimate + Newton-Raphson iterations + // because the resulting code is more dense and is more likely to inline and + // as it uses fewer instructions. return 1.0F / scalar_sqrt(input); -#endif } #if defined(RTM_SSE2_INTRINSICS) From 07b494e640af3d9f439a32e5017de5d555716fb0 Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 15:40:18 -0500 Subject: [PATCH 5/8] docs(scalar): fix performance note wording --- includes/rtm/scalarf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index 9a46278b..49924d2f 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -295,7 +295,7 @@ namespace rtm { // Performance note: // With modern out-of-order executing processors, it is typically faster to use - // a full division instead of a reciprocal estimate + Newton-Raphson iterations + // a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations // because the resulting code is more dense and is more likely to inline and // as it uses fewer instructions. return 1.0F / scalar_sqrt(input); From 561aefdb4c3581109a2195779d7b6ccf45ce8625 Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 15:40:51 -0500 Subject: [PATCH 6/8] feat(vector4): add vector_sqrt_reciprocal --- includes/rtm/vector4d.h | 13 +++++++++++++ includes/rtm/vector4f.h | 13 +++++++++++++ tests/sources/test_vector4_impl.h | 5 +++++ 3 files changed, 31 insertions(+) diff --git a/includes/rtm/vector4d.h b/includes/rtm/vector4d.h index b8983c46..c6f6322d 100644 --- a/includes/rtm/vector4d.h +++ b/includes/rtm/vector4d.h @@ -1001,6 +1001,19 @@ namespace rtm #endif } + ////////////////////////////////////////////////////////////////////////// + // Per component reciprocal square root of the input: 1.0 / sqrt(input) + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4d vector_sqrt_reciprocal(const vector4d& input) RTM_NO_EXCEPT + { + // Performance note: + // With modern out-of-order executing processors, it is typically faster to use + // a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations + // because the resulting code is more dense and is more likely to inline and + // as it uses fewer instructions. + return vector_reciprocal(vector_sqrt(input)); + } + ////////////////////////////////////////////////////////////////////////// // Per component returns the smallest integer value not less than the input (round towards positive infinity). // vector_ceil([1.8, 1.0, -1.8, -1.0]) = [2.0, 1.0, -1.0, -1.0] diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h index 2f5b5f37..ac3c5cfa 100644 --- a/includes/rtm/vector4f.h +++ b/includes/rtm/vector4f.h @@ -1165,6 +1165,19 @@ namespace rtm #endif } + ////////////////////////////////////////////////////////////////////////// + // Per component reciprocal square root of the input: 1.0 / sqrt(input) + ////////////////////////////////////////////////////////////////////////// + RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_sqrt_reciprocal(vector4f_arg0 input) RTM_NO_EXCEPT + { + // Performance note: + // With modern out-of-order executing processors, it is typically faster to use + // a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations + // because the resulting code is more dense and is more likely to inline and + // as it uses fewer instructions. + return vector_reciprocal(vector_sqrt(input)); + } + ////////////////////////////////////////////////////////////////////////// // Per component returns the smallest integer value not less than the input (round towards positive infinity). // vector_ceil([1.8, 1.0, -1.8, -1.0]) = [2.0, 1.0, -1.0, -1.0] diff --git a/tests/sources/test_vector4_impl.h b/tests/sources/test_vector4_impl.h index 6e64eebb..ed97da45 100644 --- a/tests/sources/test_vector4_impl.h +++ b/tests/sources/test_vector4_impl.h @@ -547,6 +547,11 @@ void test_vector4_arithmetic_impl(const FloatType threshold) CHECK(scalar_near_equal(vector_get_z(vector_sqrt(vector_abs(test_value0))), scalar_sqrt(scalar_abs(test_value0_flt[2])), threshold)); CHECK(scalar_near_equal(vector_get_w(vector_sqrt(vector_abs(test_value0))), scalar_sqrt(scalar_abs(test_value0_flt[3])), threshold)); + CHECK(scalar_near_equal(vector_get_x(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[0]))), threshold)); + CHECK(scalar_near_equal(vector_get_y(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[1]))), threshold)); + CHECK(scalar_near_equal(vector_get_z(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[2]))), threshold)); + CHECK(scalar_near_equal(vector_get_w(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[3]))), threshold)); + const Vector4Type neg_zero = vector_set(FloatType(-0.0)); CHECK(FloatType(vector_get_x(vector_floor(neg_zero))) == scalar_floor(FloatType(-0.0))); CHECK(FloatType(vector_get_x(vector_floor(test_value0))) == scalar_floor(test_value0_flt[0])); From add676c3fae31dae0758a515752e9a1427a765a5 Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 22:34:00 -0500 Subject: [PATCH 7/8] fix(scalar): inline scalar_reciprocal to avoid GCC 6 codegen bug --- includes/rtm/scalarf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index 49924d2f..794e93d7 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -1267,8 +1267,9 @@ namespace rtm __m128 abs_value = _mm_and_ps(value.value, _mm_castsi128_ps(abs_mask)); // Compute our value - __m128 is_larger_than_one = _mm_cmpgt_ss(abs_value, _mm_set_ps1(1.0F)); - __m128 reciprocal = scalar_reciprocal(scalarf{ abs_value }).value; + const __m128 one = _mm_set_ps1(1.0F); + __m128 is_larger_than_one = _mm_cmpgt_ss(abs_value, one); + __m128 reciprocal = _mm_div_ss(one, abs_value); __m128 x = RTM_VECTOR4F_SELECT(is_larger_than_one, reciprocal, abs_value); From 0546dfbf5dddcbc5c0205044c3b94331c7a9d1c7 Mon Sep 17 00:00:00 2001 From: Nicholas Frechette Date: Sun, 17 Dec 2023 23:13:25 -0500 Subject: [PATCH 8/8] fix(scalar): attempt GCC 6 codegen fix --- includes/rtm/scalarf.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h index 794e93d7..8146b7d4 100644 --- a/includes/rtm/scalarf.h +++ b/includes/rtm/scalarf.h @@ -1352,15 +1352,18 @@ namespace rtm // If X < 0.0, we return atan(y/x) + sign(Y) * PI // See: https://en.wikipedia.org/wiki/Atan2#Definition_and_computation + const __m128 y_value = y.value; + const __m128 x_value = x.value; + const __m128 zero = _mm_setzero_ps(); - __m128 is_x_zero = _mm_cmpeq_ss(x.value, zero); - __m128 is_y_zero = _mm_cmpeq_ss(y.value, zero); + __m128 is_x_zero = _mm_cmpeq_ss(x_value, zero); + __m128 is_y_zero = _mm_cmpeq_ss(y_value, zero); __m128 inputs_are_zero = _mm_and_ps(is_x_zero, is_y_zero); - __m128 is_x_positive = _mm_cmpgt_ss(x.value, zero); + __m128 is_x_positive = _mm_cmpgt_ss(x_value, zero); const __m128 sign_mask = _mm_set_ps(-0.0F, -0.0F, -0.0F, -0.0F); - __m128 y_sign = _mm_and_ps(y.value, sign_mask); + __m128 y_sign = _mm_and_ps(y_value, sign_mask); // If X == 0.0, our offset is PI/2 otherwise it is PI both with the sign of Y __m128 half_pi = _mm_set_ps1(constants::half_pi()); @@ -1374,8 +1377,10 @@ namespace rtm // If X == 0.0 and Y == 0.0, our offset is 0.0 offset = _mm_andnot_ps(inputs_are_zero, offset); - __m128 angle = _mm_div_ss(y.value, x.value); - __m128 value = scalar_atan(scalarf{ angle }).value; + __m128 angle = _mm_div_ss(y_value, x_value); + scalarf angle_s = scalarf{ angle }; + scalarf value_s = scalar_atan(angle_s); + __m128 value = value_s.value; // If X == 0.0, our value is 0.0 otherwise it is atan(y/x) value = _mm_or_ps(_mm_and_ps(is_x_zero, zero), _mm_andnot_ps(is_x_zero, value));