From 890d05aeceb7b1afd209a636fa188f57085bbff9 Mon Sep 17 00:00:00 2001
From: Yang Hau <yuanyanghau@gmail.com>
Date: Fri, 28 Jun 2024 19:06:26 +0800
Subject: [PATCH] wip: tests: rework load helpers to avoid pointer-cast punning

---
 tests/impl.cpp | 51 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 11e8fa05..6f77b5d9 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -409,6 +409,17 @@ __m128i do_mm_set_epi32(int32_t x, int32_t y, int32_t z, int32_t w)
     return a;
 }
 
+typedef union bit64_union_t {
+    double f64;    // floating-point view of the shared storage
+    int64_t i64;   // signed-integer view of the same storage
+    uint64_t u64;  // unsigned-integer view of the same storage
+} bit64_union_t;
+typedef union bit32_union_t {
+    float f32;     // floating-point view of the shared storage
+    int32_t i32;   // signed-integer view of the same storage
+    uint32_t u32;  // unsigned-integer view of the same storage
+} bit32_union_t;
+
 // This function is not called from "runSingleTest", but for other intrinsic
 // tests that might need to load __m64 data.
 template <class T>
@@ -419,10 +430,10 @@ __m64 load_m64(const T *p)
 
 // This function is not called from "runSingleTest", but for other intrinsic
 // tests that might need to call "_mm_load_ps".
-template <class T>
-__m128 load_m128(const T *p)
+// Takes "const float *" only; callers cast other element types themselves.
+__m128 load_m128(const float *p)
 {
-    return _mm_loadu_ps((const float *) p);
+    return _mm_loadu_ps(p);
 }
 
 // This function is not called from "runSingleTest", but for other intrinsic
@@ -430,17 +441,17 @@ __m128 load_m128(const T *p)
 template <class T>
 __m128i load_m128i(const T *p)
 {
-    __m128 a = _mm_loadu_ps((const float *) p);
-    __m128i ia = *(const __m128i *) &a;
+    // Unaligned 128-bit load directly into an integer vector.
+    __m128i ia = _mm_loadu_si128((const __m128i *) p);
     return ia;
 }
 
 // This function is not called from "runSingleTest", but for other intrinsic
 // tests that might need to call "_mm_load_pd".
-template <class T>
-__m128d load_m128d(const T *p)
+// Takes "const double *" only; callers cast other element types themselves.
+__m128d load_m128d(const double *p)
 {
-    return _mm_loadu_pd((const double *) p);
+    return _mm_loadu_pd(p);
 }
 
 // This function is not called from "runSingleTest", but for other intrinsic
@@ -3261,8 +3272,8 @@ result_t test_mm_xor_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
     int32_t d2 = _a[2] ^ _b[2];
     int32_t d3 = _a[3] ^ _b[3];
 
-    __m128 a = load_m128(_a);
-    __m128 b = load_m128(_b);
+    __m128 a = load_m128((const float *)_a);
+    __m128 b = load_m128((const float *)_b);
     __m128 c = _mm_xor_ps(a, b);
 
     return validateFloat(c, *((float *) &d0), *((float *) &d1),
@@ -3556,8 +3567,8 @@ result_t test_mm_and_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     int64_t d0 = _a[0] & _b[0];
     int64_t d1 = _a[1] & _b[1];
 
-    __m128d a = load_m128d(_a);
-    __m128d b = load_m128d(_b);
+    __m128d a = load_m128d((const double *)_a);
+    __m128d b = load_m128d((const double *)_b);
     __m128d c = _mm_and_pd(a, b);
 
     return validateDouble(c, *((double *) &d0), *((double *) &d1));
@@ -3690,7 +3701,7 @@ result_t test_mm_bsrli_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 result_t test_mm_castpd_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const float *_a = impl.mTestFloatPointer1;
-    const __m128d a = load_m128d(_a);
+    const __m128d a = load_m128d((const double *)_a);
     const __m128 _c = load_m128(_a);
 
     __m128 r = _mm_castpd_ps(a);
@@ -3701,7 +3712,7 @@ result_t test_mm_castpd_ps(const SSE2NEONTestImpl &impl, uint32_t iter)
 result_t test_mm_castpd_si128(const SSE2NEONTestImpl &impl, uint32_t iter)
 {
     const float *_a = impl.mTestFloatPointer1;
-    const __m128d a = load_m128d(_a);
+    const __m128d a = load_m128d((const double *)_a);
     const __m128i *_c = (const __m128i *) _a;
 
     __m128i r = _mm_castpd_si128(a);
@@ -5508,8 +5519,8 @@ result_t test_mm_or_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     int64_t d0 = _a[0] | _b[0];
     int64_t d1 = _a[1] | _b[1];
 
-    __m128d a = load_m128d(_a);
-    __m128d b = load_m128d(_b);
+    __m128d a = load_m128d((const double *)_a);
+    __m128d b = load_m128d((const double *)_b);
     __m128d c = _mm_or_pd(a, b);
 
     return validateDouble(c, *((double *) &d0), *((double *) &d1));
@@ -7096,8 +7107,8 @@ result_t test_mm_xor_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     int64_t d0 = _a[0] ^ _b[0];
     int64_t d1 = _a[1] ^ _b[1];
 
-    __m128d a = load_m128d(_a);
-    __m128d b = load_m128d(_b);
+    __m128d a = load_m128d((const double *)_a);
+    __m128d b = load_m128d((const double *)_b);
     __m128d c = _mm_xor_pd(a, b);
 
     return validateDouble(c, *((double *) &d0), *((double *) &d1));
@@ -8095,7 +8106,9 @@ result_t test_mm_blendv_pd(const SSE2NEONTestImpl &impl, uint32_t iter)
     for (int i = 0; i < 2; i++) {
         // signed shift right would return a result which is either all 1's from
         // negative numbers or all 0's from positive numbers
-        if ((*(const int64_t *) (_mask + i)) >> 63) {
+        bit64_union_t m;
+        m.f64 = _mask[i];
+        if (m.i64 >> 63) {
             _c[i] = _b[i];
         } else {
             _c[i] = _a[i];