From 6a0be5eb07df904cfd2ae5a55aa815dbff4ba8b8 Mon Sep 17 00:00:00 2001
From: Aras Pranckevicius <aras@nesnausk.org>
Date: Fri, 8 Dec 2023 16:44:51 +0200
Subject: [PATCH] SIMD: faster vint4 load/store with unsigned char conversion

vint4::load from an unsigned char pointer gains an SSE2 code path for
pre-SSE4 builds. Tested on Ryzen 5950X / VS2022 (with only SSE2 enabled
in the build):
- vint4 load from unsigned char[]: 946.1 -> 4232.8 Mvals/sec

vint4::store to an unsigned char pointer gets a simpler/faster SSE code
path and a new NEON code path. It also gains correctness test coverage,
including values outside the unsigned char range (the current behavior
keeps only the lowest byte of each integer lane, i.e. it wraps rather
than clamps).

- vint4 store to unsigned char[]: 3489.8 -> 3979.3 Mvals/sec
- vint8 store to unsigned char[]: 5516.9 -> 7325.3 Mvals/sec

NEON code path as tested on Mac M1 Max (clang 15):
- vint4 store to unsigned char[]: 4137.2 -> 6074.8 Mvals/sec

Signed-off-by: Aras Pranckevicius <aras@nesnausk.org>
---
 src/include/OpenImageIO/simd.h | 25 ++++++++++++++-----------
 src/libutil/simd_test.cpp      |  8 ++++++++
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/include/OpenImageIO/simd.h b/src/include/OpenImageIO/simd.h
index 277ddf9fb2..85dfc419e6 100644
--- a/src/include/OpenImageIO/simd.h
+++ b/src/include/OpenImageIO/simd.h
@@ -4293,6 +4293,11 @@ OIIO_FORCEINLINE void vint4::load (const unsigned char *values) {
     // Trickery: load one float worth of bits = 4 uchars!
     simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
     m_simd = _mm_cvtepu8_epi32 (a);
+#elif OIIO_SIMD_SSE >= 2
+    // SSE2 path: same one-float (4 uchar) load, then widen by interleaving with zero bytes.
+    simd_t a = _mm_castps_si128 (_mm_load_ss ((const float *)values));
+    a = _mm_unpacklo_epi8(a, _mm_setzero_si128());       // 8 -> 16 bit:  A0 B0 C0 D0 ...
+    m_simd = _mm_unpacklo_epi16(a, _mm_setzero_si128()); // 16 -> 32 bit: A000 B000 C000 D000
 #else
     SIMD_CONSTRUCT (values[i]);
 #endif
@@ -4784,17 +4789,15 @@ OIIO_FORCEINLINE void vint4::store (unsigned char *values) const {
 #if OIIO_AVX512VL_ENABLED
     _mm_mask_cvtepi32_storeu_epi8 (values, __mmask8(0xf), m_simd);
 #elif OIIO_SIMD_SSE
-    // Expressed as bytes and considering little endianness, we
-    // currently have AxBxCxDx (the 'x' means don't care).
-    vint4 clamped = m_simd & vint4(0xff);          // A000 B000 C000 D000
-    vint4 swapped = shuffle_sse<1,0,3,2>(clamped); // B000 A000 D000 C000
-    vint4 shifted = swapped << 8;                  // 0B00 0A00 0D00 0C00
-    vint4 merged = clamped | shifted;              // AB00 xxxx CD00 xxxx
-    vint4 merged2 = shuffle_sse<2,2,2,2>(merged);  // CD00 ...
-    vint4 shifted2 = merged2 << 16;                // 00CD ...
-    vint4 result = merged | shifted2;              // ABCD ...
-    memcpy(values, &result, 4);  // memcpy because it may be unaligned
-    // At this point, values[] should hold A,B,C,D
+    vint4 clamped = m_simd & vint4(0xff);                          // A000 B000 C000 D000 (low byte of each lane, no clamping)
+    simd_t val16 = _mm_packs_epi32(clamped, _mm_setzero_si128());  // A0B0 C0D0 xxxx xxxx (SSE2 pack; lanes are 0..255 so saturation is a no-op)
+    simd_t val8 = _mm_packus_epi16(val16, _mm_setzero_si128());    // ABCD xxxx xxxx xxxx
+    _mm_store_ss((float*)values, _mm_castsi128_ps(val8));          // write only the low 4 bytes
+#elif OIIO_SIMD_NEON
+    vint4 clamped = m_simd & vint4(0xff);                                // keep the low byte of each lane
+    int16x8_t val16 = vcombine_s16(vqmovn_s32(clamped), vdup_n_s16(0));  // narrow 32 -> 16 bit
+    uint8x16_t val8 = vcombine_u8(vqmovun_s16(val16), vdup_n_u8(0));     // narrow 16 -> 8 bit
+    vst1q_lane_u32((uint32_t*)values, vreinterpretq_u32_u8(val8), 0);    // store lane 0 = the 4 packed bytes
 #else
     SIMD_DO (values[i] = m_val[i]);
 #endif
diff --git a/src/libutil/simd_test.cpp b/src/libutil/simd_test.cpp
index 5e056d9d11..e94f3a1305 100644
--- a/src/libutil/simd_test.cpp
+++ b/src/libutil/simd_test.cpp
@@ -506,6 +506,14 @@ void test_conversion_loadstore_int ()
     OIIO_CHECK_SIMD_EQUAL (VEC(uc1234), C1234);
     OIIO_CHECK_SIMD_EQUAL (VEC( c1234), C1234);
 
+    // Check store to unsigned char[]: lanes outside 0..255 keep only their low byte
+    VEC CStep = VEC::Iota(-130, 131);
+    unsigned char ucStepExp[]  = {126, 1, 132, 7, 138, 13, 144, 19, 150, 25, 156, 31, 162, 37, 168, 43};
+    unsigned char ucStepGot[VEC::elements] = {};
+    CStep.store(ucStepGot);
+    for (int i = 0; i < VEC::elements; ++i)
+        OIIO_CHECK_EQUAL ((int)ucStepGot[i], (int)ucStepExp[i]);
+
     benchmark ("load from int[]", [](const int *d){ return VEC(d); }, i1234);
     benchmark ("load from unsigned short[]", [](const unsigned short *d){ return VEC(d); }, us1234);
     benchmark ("load from short[]", [](const short *d){ return VEC(d); }, s1234);