Skip to content

Commit c9c9ff7

Browse files
committed
Added AVX512 support for SkRasterPipeline_opts.h
This patch integrates AVX-512 support for certain functions in both the highp and lowp pipelines of SkRasterPipeline_opts.h. Testing and verification were conducted in the Pdfium repository, where the change passed pdfium_embeddertests.exe. Performance-wise, the AVX-512 code path shows a significant improvement over the standard SSE and AVX2 paths; this was confirmed by testing with PDF files sourced from the resources folder of the Pdfium library. This is an imported pull request from google#149. GitOrigin-RevId: 3dfeb3b Change-Id: I91f95a69d914ed57707239b7d2257a6c8f0c3ffa This is an imported pull request from google#151. GitOrigin-RevId: 02db57e Change-Id: Ia674977e3c1a083938bbfda1e9d785595896cb88
1 parent 5b3041f commit c9c9ff7

File tree

2 files changed

+270
-7
lines changed

2 files changed

+270
-7
lines changed

src/core/SkRasterPipelineOpContexts.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ namespace SkSL { class TraceHook; }
1919
// by stages that have no lowp implementation. They can therefore use the (smaller) highp value to
2020
// save memory in the arena.
2121
inline static constexpr int SkRasterPipeline_kMaxStride = 16;
22-
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 8;
22+
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 16;
2323

2424
// How much space to allocate for each MemoryCtx scratch buffer, as part of tail-pixel handling.
2525
inline static constexpr size_t SkRasterPipeline_MaxScratchPerPatch =

src/opts/SkRasterPipeline_opts.h

Lines changed: 269 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ using NoCtx = const void*;
5959
#define JUMPER_IS_SCALAR
6060
#elif defined(SK_ARM_HAS_NEON)
6161
#define JUMPER_IS_NEON
62+
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
63+
#define JUMPER_IS_AVX512
6264
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
6365
#define JUMPER_IS_HSW
6466
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
@@ -291,6 +293,223 @@ namespace SK_OPTS_NS {
291293
SI void store4(float* ptr, F r, F g, F b, F a) {
292294
vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
293295
}
296+
#elif defined(JUMPER_IS_AVX512)
// 16-wide SIMD via AVX-512: every highp register type holds 16 lanes, so the
// pipeline processes 16 pixels per pass.
template <typename T> using V = T __attribute__((ext_vector_type(16)));
using F   = V<float   >;
using I32 = V< int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t >;

// Fused multiply-add: f*m + a in a single instruction (one rounding).
SI F   mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }

SI F   min(F a, F b)     { return _mm512_min_ps(a,b);    }
SI I32 min(I32 a, I32 b) { return _mm512_min_epi32(a,b); }
SI U32 min(U32 a, U32 b) { return _mm512_min_epu32(a,b); }
SI F   max(F a, F b)     { return _mm512_max_ps(a,b);    }
SI I32 max(I32 a, I32 b) { return _mm512_max_epi32(a,b); }
SI U32 max(U32 a, U32 b) { return _mm512_max_epu32(a,b); }

// |v|: v and (0 - v) differ only in the sign bit, so AND-ing them clears it.
SI F   abs_ (F v)   { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
SI I32 abs_ (I32 v) { return _mm512_abs_epi32(v); }
SI F   floor_(F v)  { return _mm512_floor_ps(v); }
SI F   ceil_(F v)   { return _mm512_ceil_ps(v);  }

// Hardware approximations (14-bit precision per the rcp14/rsqrt14 intrinsics).
SI F rcp_approx  (F v) { return _mm512_rcp14_ps  (v); }
SI F rsqrt_approx(F v) { return _mm512_rsqrt14_ps(v); }
SI F sqrt_       (F v) { return _mm512_sqrt_ps   (v); }

// One Newton-Raphson refinement of the approximate reciprocal: e * (2 - v*e).
SI F rcp_precise (F v) {
    F e = rcp_approx(v);
    return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
}

// Float -> int32 conversion using the current MXCSR rounding mode.
SI U32 round(F v)          { return _mm512_cvtps_epi32(v); }
SI U32 round(F v, F scale) { return _mm512_cvtps_epi32(v*scale); }
325+
// Narrow 16 x 32-bit to 16 x 16-bit with unsigned saturation.
// _mm256_packus_epi32 packs within 128-bit lanes, leaving 64-bit chunks in the
// order 0123 89ab 4567 cdef; the permute with 216 (== _MM_SHUFFLE(3,1,2,0))
// restores element order.
SI U16 pack(U32 v) {
    __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256(v),
                                      _mm512_extracti64x4_epi64(v, 1));
    return _mm256_permutex_epi64(rst, 216);
}

// Narrow 16 x 16-bit to 16 x 8-bit with unsigned saturation.  The in-lane pack
// duplicates each half; the permute with 8 (== _MM_SHUFFLE(0,0,2,0)) moves the
// two meaningful 64-bit quarters into the low 128 bits.
SI U8 pack(U16 v) {
    __m256i rst = _mm256_packus_epi16(v, v);
    return _mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
}
334+
// Bitwise select: each result bit is taken from t where the corresponding bit
// of c is set and from e where it is clear.  202 (0xCA) is the ternary-logic
// truth table for (c & t) | (~c & e).
SI F if_then_else(I32 c, F t, F e) {
    return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t),
                                                            _mm512_castps_si512(e), 202));
}

// A lane counts as "true" if any of its 32 bits are set; test_epi32_mask
// produces one mask bit per lane.
SI bool any(I32 c) { return _mm512_test_epi32_mask(c, c) != 0; }
SI bool all(I32 c) { return _mm512_test_epi32_mask(c, c) == 0xffff; }
346+
template <typename T>
347+
SI V<T> gather(const T* p, U32 ix) {
348+
return{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
349+
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]],
350+
p[ix[8]], p[ix[9]], p[ix[10]], p[ix[11]],
351+
p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]],
352+
};
353+
}
354+
SI F gather(const float* p, U32 ix) { return _mm512_i32gather_ps(ix, p, 4); }
355+
SI U32 gather(const uint32_t* p, U32 ix) { return _mm512_i32gather_epi32(ix, p, 4); }
356+
SI U64 gather(const uint64_t* p, U32 ix) {
357+
__m512i parts[] = {
358+
_mm512_i32gather_epi64(_mm512_castsi512_si256(ix), p, 8),
359+
_mm512_i32gather_epi64(_mm512_extracti32x8_epi32(ix, 1), p, 8),
360+
};
361+
return sk_bit_cast<U64>(parts);
362+
}
363+
template <typename V, typename S>
364+
SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
365+
V before = gather(dst, ix);
366+
V after = if_then_else(mask, src, before);
367+
dst[ix[0]] = after[0];
368+
dst[ix[1]] = after[1];
369+
dst[ix[2]] = after[2];
370+
dst[ix[3]] = after[3];
371+
dst[ix[4]] = after[4];
372+
dst[ix[5]] = after[5];
373+
dst[ix[6]] = after[6];
374+
dst[ix[7]] = after[7];
375+
dst[ix[8]] = after[8];
376+
dst[ix[9]] = after[9];
377+
dst[ix[10]] = after[10];
378+
dst[ix[11]] = after[11];
379+
dst[ix[12]] = after[12];
380+
dst[ix[13]] = after[13];
381+
dst[ix[14]] = after[14];
382+
dst[ix[15]] = after[15];
383+
}
384+
385+
// Load 16 interleaved rg pixel pairs (32 uint16s) and deinterleave into r, g.
// Shifting left then arithmetic-right by 16 isolates one 16-bit half of each
// 32-bit rg pair with sign extension, so the signed pack cannot saturate.
// The final permute with 216 (== _MM_SHUFFLE(3,1,2,0)) undoes the in-lane
// ordering of _mm256_packs_epi32.
SI void load2(const uint16_t* ptr, U16* r, U16* g) {
    U16 _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
    U16 _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);

    *r = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
         (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
    *g = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
                                                     _mm256_srai_epi32(_89abcdef, 16)), 216);
}

// Interleave r and g back into rg pairs and store 16 pixels (32 uint16s).
// unpacklo/unpackhi interleave within 128-bit lanes, so the 64-bit blocks come
// out as {01}{23}{89}{ab} / {45}{67}{cd}{ef}; the permutexvar index
// (0,1,4,5,2,3,6,7) reorders them into memory order before the two stores.
SI void store2(uint16_t* ptr, U16 r, U16 g) {
    auto _01234567 = _mm256_unpacklo_epi16(r, g);
    auto _89abcdef = _mm256_unpackhi_epi16(r, g);
    __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
                                                _89abcdef, 1);
    __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
    _01234567 = _mm512_castsi512_si256(aa);
    _89abcdef = _mm512_extracti64x4_epi64(aa, 1);

    _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
    _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
}
406+
407+
// Load 16 interleaved rgba pixels with 16-bit channels (64 uint16s — two
// 512-bit loads) and deinterleave into planar r, g, b, a.
SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
    __m512i _01234567 = _mm512_loadu_si512((__m512i*)ptr);
    __m512i _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));

    // Each 64-bit lane holds one pixel: r in bits 0-15, g in 16-31, b in 32-47,
    // a in 48-63.  For each channel, shift it down to the bottom of the lane,
    // mask to a full 16-bit channel, then narrow each 64-bit lane to 16 bits
    // and concatenate the two halves.
    //
    // BUG FIX: the mask must be 0xFFFF.  The previous 0xFF zeroed bits 8-15 of
    // every channel before the 64->16-bit narrowing, silently dropping the high
    // byte of each 16-bit channel value.
    __m512i mask16 = _mm512_set1_epi64(0xFFFF);

    *r = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_01234567, mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_89abcdef, mask16)));
    *g = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 16), mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 16), mask16)));
    *b = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 32), mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 32), mask16)));
    *a = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 48), mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 48), mask16)));
}
424+
// Interleave planar r, g, b, a (16-bit channels) back into 16 rgba pixels and
// store them as four 256-bit writes.  Variable names encode which pixels each
// register holds; unpacks interleave within 128-bit lanes, which is why pixels
// land in the 012389ab / 4567cdef groupings below.
SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
    auto rg012389ab = _mm256_unpacklo_epi16(r, g),
         rg4567cdef = _mm256_unpackhi_epi16(r, g),
         ba012389ab = _mm256_unpacklo_epi16(b, a),
         ba4567cdef = _mm256_unpackhi_epi16(b, a);

    auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
         _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
         _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
         _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);

    // 78 == _MM_SHUFFLE(1,0,3,2): swaps the two 128-bit halves.  The blends
    // (0xf0 keeps the low half of the first operand and the high half of the
    // second; 0x0f the reverse) stitch the pixels into consecutive groups.
    auto _ab23 = _mm256_permutex_epi64(_23ab, 78);
    auto _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0);
    auto _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78);
    auto _ef67 = _mm256_permutex_epi64(_67ef, 78);
    auto _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0);
    auto _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);

    _mm256_storeu_si256((__m256i*)ptr, _0123);
    _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
    _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
    _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
}
447+
448+
// Load 16 interleaved rgba float pixels (64 floats) and transpose to planar
// r, g, b, a.  Each accumulator gathers one 128-bit pixel from each quarter of
// the buffer, so e.g. _048c holds pixels 0, 4, 8 and c — a 4x4 block transpose
// done with inserts, finished off by the unpack network below.
SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
    F _048c, _159d, _26ae, _37bf;

    _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr)         );
    _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
    _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
    _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
    _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4)       );
    _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
    _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
    _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
    _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8)       );
    _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
    _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
    _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
    _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12)      );
    _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
    _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
    _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);

    // In-lane unpacks pair up channels from pixels two apart...
    F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
      ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
      rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
      ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);

    // ...and a second round completes the transpose into planar channels.
    *r = _mm512_unpacklo_ps(rg02468acf, rg13579bde);
    *g = _mm512_unpackhi_ps(rg02468acf, rg13579bde);
    *b = _mm512_unpacklo_ps(ba02468acf, ba13579bde);
    *a = _mm512_unpackhi_ps(ba02468acf, ba13579bde);
}
478+
479+
// Transpose planar r, g, b, a back into 16 interleaved rgba float pixels and
// store them with four contiguous 512-bit writes.  Variable names encode which
// pixels a register holds after each shuffle stage.
SI void store4(float* ptr, F r, F g, F b, F a) {
    F rg014589cd = _mm512_unpacklo_ps(r, g),
      rg2367abef = _mm512_unpackhi_ps(r, g),
      ba014589cd = _mm512_unpacklo_ps(b, a),
      ba2367abef = _mm512_unpackhi_ps(b, a);

    F _048c = _mm512_unpacklo_pd(rg014589cd, ba014589cd),  // r0 g0 b0 a0 4 8 c
      _26ae = _mm512_unpacklo_pd(rg2367abef, ba2367abef),  // r2 g2 b2 a2 6 a e
      _159d = _mm512_unpackhi_pd(rg014589cd, ba014589cd),  // r1 g1 b1 a1 5 9 d
      _37bf = _mm512_unpackhi_pd(rg2367abef, ba2367abef);  // r3 g3 b3 a3 7 b f

    // The permutexvar index (4,5,6,7,0,1,2,3) swaps the 256-bit halves; the
    // permutex2var steps then merge 256-bit halves from two sources, and the
    // in-lane permutes (imm 176) fix up the final 64-bit pixel order.
    F _ae26 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _26ae),
      _bf37 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _37bf),
      _8c04 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _048c),
      _9d15 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _159d),

      _0426 = _mm512_permutex2var_pd(_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _ae26),
      _1537 = _mm512_permutex2var_pd(_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _bf37),
      _5173 = _mm512_permutex_pd(_1537, 176),
      _0123 = _mm512_permutex2var_pd(_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _5173),
      _5476 = _mm512_permutex2var_pd(_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _0426),
      _4567 = _mm512_permutex_pd(_5476, 176),
      _8cae = _mm512_permutex2var_pd(_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _26ae),
      _9dbf = _mm512_permutex2var_pd(_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _37bf),
      _d9fb = _mm512_permutex_pd(_9dbf, 176),
      _89ab = _mm512_permutex2var_pd(_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _d9fb),
      _dcfe = _mm512_permutex2var_pd(_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _8cae),
      _cdef = _mm512_permutex_pd(_dcfe, 176);

    _mm512_storeu_ps(ptr+0,  _0123);
    _mm512_storeu_ps(ptr+16, _4567);
    _mm512_storeu_ps(ptr+32, _89ab);
    _mm512_storeu_ps(ptr+48, _cdef);
}
294513

295514
#elif defined(JUMPER_IS_HSW)
296515
// These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -631,6 +850,12 @@ template <typename T> using V = T __attribute__((ext_vector_type(4)));
631850
SI U32 trunc_(F v) { return (U32)v; }
632851
SI U32 expand(U16 v) { return (U32)v; }
633852
SI U32 expand(U8 v) { return (U32)v; }
853+
#elif defined (JUMPER_IS_AVX512)
    // Lane-wise conversions for the 16-wide AVX-512 types.
    SI F cast  (U32 v) { return _mm512_cvtepu32_ps(v); }            // unsigned int -> float
    SI F cast64(U64 v) { return __builtin_convertvector( v, F); }   // uint64 -> float, per lane
    SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }  // truncate toward zero
    SI U32 expand(U16 v) { return _mm512_cvtepu16_epi32(v); }       // zero-extend 16 -> 32 bit
    SI U32 expand(U8  v) { return _mm512_cvtepu8_epi32(v); }        // zero-extend  8 -> 32 bit
634859
#else
635860
SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
636861
SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
@@ -692,6 +917,9 @@ SI F from_half(U16 h) {
692917
&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
693918
return vcvt_f32_f16(h);
694919

920+
#elif defined(JUMPER_IS_AVX512)
921+
return _mm512_cvtph_ps(h);
922+
695923
#elif defined(JUMPER_IS_HSW)
696924
return _mm256_cvtph_ps(h);
697925

@@ -713,6 +941,9 @@ SI U16 to_half(F f) {
713941
&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
714942
return vcvt_f16_f32(f);
715943

944+
#elif defined(JUMPER_IS_AVX512)
945+
return _mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
946+
716947
#elif defined(JUMPER_IS_HSW)
717948
return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
718949

@@ -4162,7 +4393,7 @@ namespace lowp {
41624393

41634394
#else // We are compiling vector code with Clang... let's make some lowp stages!
41644395

4165-
#if defined(JUMPER_IS_HSW)
4396+
#if defined(JUMPER_IS_AVX512) || defined(JUMPER_IS_HSW)
41664397
using U8 = uint8_t __attribute__((ext_vector_type(16)));
41674398
using U16 = uint16_t __attribute__((ext_vector_type(16)));
41684399
using I16 = int16_t __attribute__((ext_vector_type(16)));
@@ -4440,7 +4671,10 @@ SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
44404671

44414672
// Use approximate instructions and one Newton-Raphson step to calculate 1/x.
44424673
SI F rcp_precise(F x) {
4443-
#if defined(JUMPER_IS_HSW)
4674+
#if defined(JUMPER_IS_AVX512)
4675+
F e = _mm512_rcp14_ps(x);
4676+
return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
4677+
#elif defined(JUMPER_IS_HSW)
44444678
__m256 lo,hi;
44454679
split(x, &lo,&hi);
44464680
return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
@@ -4457,7 +4691,9 @@ SI F rcp_precise(F x) {
44574691
#endif
44584692
}
44594693
SI F sqrt_(F x) {
4460-
#if defined(JUMPER_IS_HSW)
4694+
#if defined(JUMPER_IS_AVX512)
4695+
return _mm512_sqrt_ps(x);
4696+
#elif defined(JUMPER_IS_HSW)
44614697
__m256 lo,hi;
44624698
split(x, &lo,&hi);
44634699
return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
@@ -4492,6 +4728,8 @@ SI F floor_(F x) {
44924728
float32x4_t lo,hi;
44934729
split(x, &lo,&hi);
44944730
return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
4731+
#elif defined(JUMPER_IS_AVX512)
4732+
return _mm512_floor_ps(x);
44954733
#elif defined(JUMPER_IS_HSW)
44964734
__m256 lo,hi;
44974735
split(x, &lo,&hi);
@@ -4512,7 +4750,9 @@ SI F floor_(F x) {
45124750
// The result is a number on [-1, 1).
45134751
// Note: on neon this is a saturating multiply while the others are not.
45144752
SI I16 scaled_mult(I16 a, I16 b) {
4515-
#if defined(JUMPER_IS_HSW)
4753+
#if defined(JUMPER_IS_AVX512)
4754+
return _mm256_mulhrs_epi16(a, b);
4755+
#elif defined(JUMPER_IS_HSW)
45164756
return _mm256_mulhrs_epi16(a, b);
45174757
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
45184758
return _mm_mulhrs_epi16(a, b);
@@ -4786,7 +5026,25 @@ SI void store(T* ptr, V v) {
47865026
memcpy(ptr, &v, sizeof(v));
47875027
}
47885028

4789-
#if defined(JUMPER_IS_HSW)
5029+
#if defined(JUMPER_IS_AVX512)
    // Generic 16-lane gather for the lowp pipeline: one scalar load per lane.
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        V out;
        for (int lane = 0; lane < 16; lane++) {
            out[lane] = ptr[ix[lane]];
        }
        return out;
    }

    // Hardware-accelerated specializations for 32-bit elements.
    template<>
    F gather(const float* ptr, U32 ix) {
        return _mm512_i32gather_ps(ix, ptr, 4);
    }

    template<>
    U32 gather(const uint32_t* ptr, U32 ix) {
        return _mm512_i32gather_epi32(ix, ptr, 4);
    }
5047+
#elif defined(JUMPER_IS_HSW)
47905048
template <typename V, typename T>
47915049
SI V gather(const T* ptr, U32 ix) {
47925050
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
@@ -4824,7 +5082,12 @@ SI void store(T* ptr, V v) {
48245082
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
48255083

48265084
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
4827-
#if defined(JUMPER_IS_HSW)
5085+
#if defined(JUMPER_IS_AVX512)
5086+
rgba = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), rgba);
5087+
auto cast_U16 = [](U32 v) -> U16 {
5088+
return _mm256_packus_epi32(_mm512_castsi512_si256(v), _mm512_extracti64x4_epi64(v, 1));
5089+
};
5090+
#elif defined(JUMPER_IS_HSW)
48285091
// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
48295092
__m256i _01,_23;
48305093
split(rgba, &_01, &_23);

0 commit comments

Comments
 (0)