From 7159d1bdc6b488ca312717be8a3c14eab908be9f Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 15:29:30 -0500
Subject: [PATCH 1/8] style(scalar): use capital F for floating point constant

---
 includes/rtm/scalarf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index bb85c12f..f04a1693 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -344,7 +344,7 @@ namespace rtm
 #if defined(RTM_SSE2_INTRINSICS)
 		return scalar_cast(scalar_reciprocal(scalar_set(input)));
 #else
-		return 1.0f / input;
+		return 1.0F / input;
 #endif
 	}
 

From c09180fe82aa852d69800435ca258d77091245ad Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 15:35:42 -0500
Subject: [PATCH 2/8] docs: add performance note

---
 includes/rtm/scalarf.h  | 5 +++++
 includes/rtm/vector4d.h | 5 +++++
 includes/rtm/vector4f.h | 5 +++++
 3 files changed, 15 insertions(+)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index f04a1693..a5570820 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -344,6 +344,11 @@ namespace rtm
 #if defined(RTM_SSE2_INTRINSICS)
 		return scalar_cast(scalar_reciprocal(scalar_set(input)));
 #else
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense and is more likely to inline and
+		// as it uses fewer instructions.
 		return 1.0F / input;
 #endif
 	}
diff --git a/includes/rtm/vector4d.h b/includes/rtm/vector4d.h
index 52e4e32d..b8983c46 100644
--- a/includes/rtm/vector4d.h
+++ b/includes/rtm/vector4d.h
@@ -977,6 +977,11 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4d vector_reciprocal(const vector4d& input) RTM_NO_EXCEPT
 	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense and is more likely to inline and
+		// as it uses fewer instructions.
 		return vector_div(vector_set(1.0), input);
 	}
 
diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h
index 06e68187..2f5b5f37 100644
--- a/includes/rtm/vector4f.h
+++ b/includes/rtm/vector4f.h
@@ -1123,6 +1123,11 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_reciprocal(vector4f_arg0 input) RTM_NO_EXCEPT
 	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense and is more likely to inline and
+		// as it uses fewer instructions.
 #if defined(RTM_SSE2_INTRINSICS)
 		return _mm_div_ps(_mm_set_ps1(1.0F), input);
 #elif defined(RTM_NEON64_INTRINSICS)

From 15070b78eb28699e276fa3ca5dea7259aaa1c5ff Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 15:36:16 -0500
Subject: [PATCH 3/8] fix(scalar): use C++ scalar code to avoid intrinsic
 roundtrip overhead

---
 includes/rtm/scalarf.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index a5570820..5c921916 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -341,16 +341,12 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE float RTM_SIMD_CALL scalar_reciprocal(float input) RTM_NO_EXCEPT
 	{
-#if defined(RTM_SSE2_INTRINSICS)
-		return scalar_cast(scalar_reciprocal(scalar_set(input)));
-#else
 		// Performance note:
 		// With modern out-of-order executing processors, it is typically faster to use
 		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
 		// because the resulting code is more dense and is more likely to inline and
 		// as it uses fewer instructions.
 		return 1.0F / input;
-#endif
 	}
 
 #if defined(RTM_SSE2_INTRINSICS)

From 4a2e61498d12bf1ca395a8f69486f54cc95650e9 Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 15:38:37 -0500
Subject: [PATCH 4/8] fix(scalar): use div/sqrt for scalar reciprocal square
 root

---
 includes/rtm/scalarf.h | 37 ++++++-------------------------------
 1 file changed, 6 insertions(+), 31 deletions(-)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index 5c921916..9a46278b 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -282,36 +282,10 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	// Returns the reciprocal square root of the input.
 	//////////////////////////////////////////////////////////////////////////
-	#if defined(RTM_COMPILER_MSVC) && _MSC_VER >= 1920 && _MSC_VER < 1925 && defined(RTM_ARCH_X64) && !defined(RTM_AVX_INTRINSICS)
-		// HACK!!! Visual Studio 2019 has a code generation bug triggered by the code below, disable optimizations for now
-		// Bug only happens with x64 SSE2, not with AVX nor with x86
-		// Fixed in 16.5.4, see https://github.com/nfrechette/rtm/issues/35
-		// TODO: Remove this hack sometime in 2022 or later once the fix is old enough that we no longer have to support the hack
-		#pragma optimize("", off)
-	#endif
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE scalarf RTM_SIMD_CALL scalar_sqrt_reciprocal(scalarf_arg0 input) RTM_NO_EXCEPT
 	{
-		// Perform two passes of Newton-Raphson iteration on the hardware estimate
-		const __m128 half = _mm_set_ss(0.5F);
-		const __m128 input_half = _mm_mul_ss(input.value, half);
-		const __m128 x0 = _mm_rsqrt_ss(input.value);
-
-		// First iteration
-		__m128 x1 = _mm_mul_ss(x0, x0);
-		x1 = _mm_sub_ss(half, _mm_mul_ss(input_half, x1));
-		x1 = _mm_add_ss(_mm_mul_ss(x0, x1), x0);
-
-		// Second iteration
-		__m128 x2 = _mm_mul_ss(x1, x1);
-		x2 = _mm_sub_ss(half, _mm_mul_ss(input_half, x2));
-		x2 = _mm_add_ss(_mm_mul_ss(x1, x2), x1);
-
-		return scalarf{ x2 };
+		return scalarf{ _mm_div_ss(_mm_set_ps1(1.0F), _mm_sqrt_ss(input.value)) };
 	}
-	#if defined(RTM_COMPILER_MSVC) && _MSC_VER >= 1920 && _MSC_VER < 1925 && defined(RTM_ARCH_X64) && !defined(RTM_AVX_INTRINSICS)
-		// HACK!!! See comment above
-		#pragma optimize("", on)
-	#endif
 #endif
 
 	//////////////////////////////////////////////////////////////////////////
@@ -319,11 +293,12 @@ namespace rtm
 	//////////////////////////////////////////////////////////////////////////
 	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE float RTM_SIMD_CALL scalar_sqrt_reciprocal(float input) RTM_NO_EXCEPT
 	{
-#if defined(RTM_SSE2_INTRINSICS)
-		return scalar_cast(scalar_sqrt_reciprocal(scalar_set(input)));
-#else
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense and is more likely to inline and
+		// as it uses fewer instructions.
 		return 1.0F / scalar_sqrt(input);
-#endif
 	}
 
 #if defined(RTM_SSE2_INTRINSICS)

From 07b494e640af3d9f439a32e5017de5d555716fb0 Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 15:40:18 -0500
Subject: [PATCH 5/8] docs(scalar): fix performance note wording

---
 includes/rtm/scalarf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index 9a46278b..49924d2f 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -295,7 +295,7 @@ namespace rtm
 	{
 		// Performance note:
 		// With modern out-of-order executing processors, it is typically faster to use
-		// a full division instead of a reciprocal estimate + Newton-Raphson iterations
+		// a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations
 		// because the resulting code is more dense and is more likely to inline and
 		// as it uses fewer instructions.
 		return 1.0F / scalar_sqrt(input);

From 561aefdb4c3581109a2195779d7b6ccf45ce8625 Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 15:40:51 -0500
Subject: [PATCH 6/8] feat(vector4): add vector_sqrt_reciprocal

---
 includes/rtm/vector4d.h           | 13 +++++++++++++
 includes/rtm/vector4f.h           | 13 +++++++++++++
 tests/sources/test_vector4_impl.h |  5 +++++
 3 files changed, 31 insertions(+)

diff --git a/includes/rtm/vector4d.h b/includes/rtm/vector4d.h
index b8983c46..c6f6322d 100644
--- a/includes/rtm/vector4d.h
+++ b/includes/rtm/vector4d.h
@@ -1001,6 +1001,19 @@ namespace rtm
 #endif
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// Per component reciprocal square root of the input: 1.0 / sqrt(input)
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4d vector_sqrt_reciprocal(const vector4d& input) RTM_NO_EXCEPT
+	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense and is more likely to inline and
+		// as it uses fewer instructions.
+		return vector_reciprocal(vector_sqrt(input));
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Per component returns the smallest integer value not less than the input (round towards positive infinity).
 	// vector_ceil([1.8, 1.0, -1.8, -1.0]) = [2.0, 1.0, -1.0, -1.0]
diff --git a/includes/rtm/vector4f.h b/includes/rtm/vector4f.h
index 2f5b5f37..ac3c5cfa 100644
--- a/includes/rtm/vector4f.h
+++ b/includes/rtm/vector4f.h
@@ -1165,6 +1165,19 @@ namespace rtm
 #endif
 	}
 
+	//////////////////////////////////////////////////////////////////////////
+	// Per component reciprocal square root of the input: 1.0 / sqrt(input)
+	//////////////////////////////////////////////////////////////////////////
+	RTM_DISABLE_SECURITY_COOKIE_CHECK RTM_FORCE_INLINE vector4f RTM_SIMD_CALL vector_sqrt_reciprocal(vector4f_arg0 input) RTM_NO_EXCEPT
+	{
+		// Performance note:
+		// With modern out-of-order executing processors, it is typically faster to use
+		// a full division/square root instead of a reciprocal estimate + Newton-Raphson iterations
+		// because the resulting code is more dense and is more likely to inline and
+		// as it uses fewer instructions.
+		return vector_reciprocal(vector_sqrt(input));
+	}
+
 	//////////////////////////////////////////////////////////////////////////
 	// Per component returns the smallest integer value not less than the input (round towards positive infinity).
 	// vector_ceil([1.8, 1.0, -1.8, -1.0]) = [2.0, 1.0, -1.0, -1.0]
diff --git a/tests/sources/test_vector4_impl.h b/tests/sources/test_vector4_impl.h
index 6e64eebb..ed97da45 100644
--- a/tests/sources/test_vector4_impl.h
+++ b/tests/sources/test_vector4_impl.h
@@ -547,6 +547,11 @@ void test_vector4_arithmetic_impl(const FloatType threshold)
 	CHECK(scalar_near_equal(vector_get_z(vector_sqrt(vector_abs(test_value0))), scalar_sqrt(scalar_abs(test_value0_flt[2])), threshold));
 	CHECK(scalar_near_equal(vector_get_w(vector_sqrt(vector_abs(test_value0))), scalar_sqrt(scalar_abs(test_value0_flt[3])), threshold));
 
+	CHECK(scalar_near_equal(vector_get_x(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[0]))), threshold));
+	CHECK(scalar_near_equal(vector_get_y(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[1]))), threshold));
+	CHECK(scalar_near_equal(vector_get_z(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[2]))), threshold));
+	CHECK(scalar_near_equal(vector_get_w(vector_sqrt_reciprocal(vector_abs(test_value0))), scalar_reciprocal(scalar_sqrt(scalar_abs(test_value0_flt[3]))), threshold));
+
 	const Vector4Type neg_zero = vector_set(FloatType(-0.0));
 	CHECK(FloatType(vector_get_x(vector_floor(neg_zero))) == scalar_floor(FloatType(-0.0)));
 	CHECK(FloatType(vector_get_x(vector_floor(test_value0))) == scalar_floor(test_value0_flt[0]));

From add676c3fae31dae0758a515752e9a1427a765a5 Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 22:34:00 -0500
Subject: [PATCH 7/8] fix(scalar): inline scalar_reciprocal to avoid GCC 6
 codegen bug

---
 includes/rtm/scalarf.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index 49924d2f..794e93d7 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -1267,8 +1267,9 @@ namespace rtm
 		__m128 abs_value = _mm_and_ps(value.value, _mm_castsi128_ps(abs_mask));
 
 		// Compute our value
-		__m128 is_larger_than_one = _mm_cmpgt_ss(abs_value, _mm_set_ps1(1.0F));
-		__m128 reciprocal = scalar_reciprocal(scalarf{ abs_value }).value;
+		const __m128 one = _mm_set_ps1(1.0F);
+		__m128 is_larger_than_one = _mm_cmpgt_ss(abs_value, one);
+		__m128 reciprocal = _mm_div_ss(one, abs_value);
 
 		__m128 x = RTM_VECTOR4F_SELECT(is_larger_than_one, reciprocal, abs_value);
 

From 0546dfbf5dddcbc5c0205044c3b94331c7a9d1c7 Mon Sep 17 00:00:00 2001
From: Nicholas Frechette <zeno490@gmail.com>
Date: Sun, 17 Dec 2023 23:13:25 -0500
Subject: [PATCH 8/8] fix(scalar): attempt GCC 6 codegen fix

---
 includes/rtm/scalarf.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/includes/rtm/scalarf.h b/includes/rtm/scalarf.h
index 794e93d7..8146b7d4 100644
--- a/includes/rtm/scalarf.h
+++ b/includes/rtm/scalarf.h
@@ -1352,15 +1352,18 @@ namespace rtm
 		// If X < 0.0, we return atan(y/x) + sign(Y) * PI
 		// See: https://en.wikipedia.org/wiki/Atan2#Definition_and_computation
 
+		const __m128 y_value = y.value;
+		const __m128 x_value = x.value;
+
 		const __m128 zero = _mm_setzero_ps();
-		__m128 is_x_zero = _mm_cmpeq_ss(x.value, zero);
-		__m128 is_y_zero = _mm_cmpeq_ss(y.value, zero);
+		__m128 is_x_zero = _mm_cmpeq_ss(x_value, zero);
+		__m128 is_y_zero = _mm_cmpeq_ss(y_value, zero);
 		__m128 inputs_are_zero = _mm_and_ps(is_x_zero, is_y_zero);
 
-		__m128 is_x_positive = _mm_cmpgt_ss(x.value, zero);
+		__m128 is_x_positive = _mm_cmpgt_ss(x_value, zero);
 
 		const __m128 sign_mask = _mm_set_ps(-0.0F, -0.0F, -0.0F, -0.0F);
-		__m128 y_sign = _mm_and_ps(y.value, sign_mask);
+		__m128 y_sign = _mm_and_ps(y_value, sign_mask);
 
 		// If X == 0.0, our offset is PI/2 otherwise it is PI both with the sign of Y
 		__m128 half_pi = _mm_set_ps1(constants::half_pi());
@@ -1374,8 +1377,10 @@ namespace rtm
 		// If X == 0.0 and Y == 0.0, our offset is 0.0
 		offset = _mm_andnot_ps(inputs_are_zero, offset);
 
-		__m128 angle = _mm_div_ss(y.value, x.value);
-		__m128 value = scalar_atan(scalarf{ angle }).value;
+		__m128 angle = _mm_div_ss(y_value, x_value);
+		scalarf angle_s = scalarf{ angle };
+		scalarf value_s = scalar_atan(angle_s);
+		__m128 value = value_s.value;
 
 		// If X == 0.0, our value is 0.0 otherwise it is atan(y/x)
 		value = _mm_or_ps(_mm_and_ps(is_x_zero, zero), _mm_andnot_ps(is_x_zero, value));