@@ -59,6 +59,8 @@ using NoCtx = const void*;
     #define JUMPER_IS_SCALAR
 #elif defined(SK_ARM_HAS_NEON)
     #define JUMPER_IS_NEON
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
+    #define JUMPER_IS_AVX512
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
     #define JUMPER_IS_HSW
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
@@ -291,6 +293,223 @@ namespace SK_OPTS_NS {
     SI void store4(float* ptr, F r, F g, F b, F a) {
         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
     }
+#elif defined(JUMPER_IS_AVX512)
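+    // 16-wide AVX-512 versions of these helpers. They mirror the 8-wide HSW
+    // implementations further down, swapping in 512-bit registers and intrinsics.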
+    template <typename T> using V = T __attribute__((ext_vector_type(16)));
+    using F   = V<float   >;
+    using I32 = V< int32_t>;
+    using U64 = V<uint64_t>;
+    using U32 = V<uint32_t>;
+    using U16 = V<uint16_t>;
+    using U8  = V<uint8_t >;
+
+    SI F   mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }
+    SI F   min(F a, F b)     { return _mm512_min_ps(a,b);    }
+    SI I32 min(I32 a, I32 b) { return _mm512_min_epi32(a,b); }
+    SI U32 min(U32 a, U32 b) { return _mm512_min_epu32(a,b); }
+    SI F   max(F a, F b)     { return _mm512_max_ps(a,b);    }
+    SI I32 max(I32 a, I32 b) { return _mm512_max_epi32(a,b); }
+    SI U32 max(U32 a, U32 b) { return _mm512_max_epu32(a,b); }
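+    // 0 - v is exactly -v in IEEE arithmetic, so v & (0 - v) keeps the shared
+    // magnitude bits and clears the sign bit: a branch-free float abs().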
+    SI F   abs_(F v)   { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
+    SI I32 abs_(I32 v) { return _mm512_abs_epi32(v); }
+    SI F   floor_(F v) { return _mm512_floor_ps(v); }
+    SI F   ceil_(F v)  { return _mm512_ceil_ps(v);  }
+    SI F   rcp_approx(F v)   { return _mm512_rcp14_ps(v);   }
+    SI F   rsqrt_approx(F v) { return _mm512_rsqrt14_ps(v); }
+    SI F   sqrt_(F v)  { return _mm512_sqrt_ps(v); }
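+    // One Newton-Raphson refinement of the 14-bit estimate: e' = e * (2 - v*e).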
+    SI F rcp_precise(F v) {
+        F e = rcp_approx(v);
+        return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
+    }
+    SI U32 round(F v)          { return _mm512_cvtps_epi32(v);       }
+    SI U32 round(F v, F scale) { return _mm512_cvtps_epi32(v*scale); }
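+    // The packs below work within each 128-bit lane, interleaving the halves,
+    // so a 64-bit permute afterwards restores linear order: 216 = 0b11'01'10'00
+    // reorders to [0,2,1,3], and 8 gathers the even quarters [0,2] into the
+    // low 128 bits.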
+    SI U16 pack(U32 v) {
+        __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256(v),
+                                          _mm512_extracti64x4_epi64(v, 1));
+        return _mm256_permutex_epi64(rst, 216);
+    }
+    SI U8 pack(U16 v) {
+        __m256i rst = _mm256_packus_epi16(v, v);
+        return _mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
+    }
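+    // Ternary-logic immediate 202 (0xCA) evaluates (c & t) | (~c & e): a
+    // bitwise select of t where c is set and e where it is not.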
+    SI F if_then_else(I32 c, F t, F e) {
+        return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t),
+                                                                _mm512_castps_si512(e), 202));
+    }
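+    // _mm512_test_epi32_mask() sets one mask bit per non-zero 32-bit lane.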
+    SI bool any(I32 c) {
+        __mmask16 mask32 = _mm512_test_epi32_mask(c, c);
+        return mask32 != 0;
+    }
+    SI bool all(I32 c) {
+        __mmask16 mask32 = _mm512_test_epi32_mask(c, c);
+        return mask32 == 0xffff;
+    }
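+    // The generic gather is a scalar fallback; the float, uint32_t, and
+    // uint64_t overloads below use hardware gathers.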
+    template <typename T>
+    SI V<T> gather(const T* p, U32 ix) {
+        return {p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
+                p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
+                p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
+                p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]]};
+    }
+    SI F   gather(const float*    p, U32 ix) { return _mm512_i32gather_ps(ix, p, 4);    }
+    SI U32 gather(const uint32_t* p, U32 ix) { return _mm512_i32gather_epi32(ix, p, 4); }
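+    // A 512-bit register holds only eight 64-bit lanes, so 16 lanes are
+    // gathered as two 8-lane gathers and the pair bit-cast back into one U64.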
+    SI U64 gather(const uint64_t* p, U32 ix) {
+        __m512i parts[] = {
+            _mm512_i32gather_epi64(_mm512_castsi512_si256(ix), p, 8),
+            _mm512_i32gather_epi64(_mm512_extracti32x8_epi32(ix, 1), p, 8),
+        };
+        return sk_bit_cast<U64>(parts);
+    }
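+    // Emulated scatter: gather the destination, blend in the lanes selected by
+    // mask, then store all 16 lanes, so masked-off lanes rewrite their old values.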
+    template <typename V, typename S>
+    SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
+        V before = gather(dst, ix);
+        V after  = if_then_else(mask, src, before);
+        dst[ix[ 0]] = after[ 0];
+        dst[ix[ 1]] = after[ 1];
+        dst[ix[ 2]] = after[ 2];
+        dst[ix[ 3]] = after[ 3];
+        dst[ix[ 4]] = after[ 4];
+        dst[ix[ 5]] = after[ 5];
+        dst[ix[ 6]] = after[ 6];
+        dst[ix[ 7]] = after[ 7];
+        dst[ix[ 8]] = after[ 8];
+        dst[ix[ 9]] = after[ 9];
+        dst[ix[10]] = after[10];
+        dst[ix[11]] = after[11];
+        dst[ix[12]] = after[12];
+        dst[ix[13]] = after[13];
+        dst[ix[14]] = after[14];
+        dst[ix[15]] = after[15];
+    }
+
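+    // load2()/store2() (de)interleave 16 rg pairs of 16-bit values. The
+    // (slli 16, srai 16) pairs sign-extend each 16-bit half so the signed
+    // pack is lossless, and 216 fixes the lane order as in pack() above.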
+    SI void load2(const uint16_t* ptr, U16* r, U16* g) {
+        U16 _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
+        U16 _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);
+
+        *r = _mm256_permute4x64_epi64(_mm256_packs_epi32(
+                 _mm256_srai_epi32(_mm256_slli_epi32(_01234567, 16), 16),
+                 _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
+        *g = _mm256_permute4x64_epi64(_mm256_packs_epi32(
+                 _mm256_srai_epi32(_01234567, 16),
+                 _mm256_srai_epi32(_89abcdef, 16)), 216);
+    }
+    SI void store2(uint16_t* ptr, U16 r, U16 g) {
+        auto _01234567 = _mm256_unpacklo_epi16(r, g);
+        auto _89abcdef = _mm256_unpackhi_epi16(r, g);
+        __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
+                                                    _89abcdef, 1);
+        __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
+        _01234567 = _mm512_castsi512_si256(aa);
+        _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
+
+        _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
+        _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
+    }
+
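+    // Each 64-bit lane of the two loads is one RGBA pixel of 16-bit channels.
+    // Shift the wanted channel down, mask it to 16 bits, and narrow each
+    // 64-bit lane to that channel with _mm512_cvtepi64_epi16().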
+    SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
+        __m512i _01234567 = _mm512_loadu_si512((__m512i*)ptr);
+        __m512i _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));
+
+        *r = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_01234567,
+                 _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
+                 _89abcdef, _mm512_set1_epi64(0xFFFF))));
+        *g = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
+                 _01234567, 16), _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(
+                 _mm512_and_si512(_mm512_srli_epi64(_89abcdef, 16), _mm512_set1_epi64(0xFFFF))));
+        *b = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
+                 _01234567, 32), _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(
+                 _mm512_and_si512(_mm512_srli_epi64(_89abcdef, 32), _mm512_set1_epi64(0xFFFF))));
+        *a = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
+                 _01234567, 48), _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(
+                 _mm512_and_si512(_mm512_srli_epi64(_89abcdef, 48), _mm512_set1_epi64(0xFFFF))));
+    }
+    SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
+        auto rg012389ab = _mm256_unpacklo_epi16(r, g),
+             rg4567cdef = _mm256_unpackhi_epi16(r, g),
+             ba012389ab = _mm256_unpacklo_epi16(b, a),
+             ba4567cdef = _mm256_unpackhi_epi16(b, a);
+
+        auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
+             _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
+             _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
+             _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);
+
+        auto _ab23 = _mm256_permutex_epi64(_23ab, 78);
+        auto _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0);
+        auto _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78);
+        auto _ef67 = _mm256_permutex_epi64(_67ef, 78);
+        auto _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0);
+        auto _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);
+
+        _mm256_storeu_si256((__m256i*)ptr + 0, _0123);
+        _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
+        _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
+        _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
+    }
+
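+    // Load 16 RGBA float pixels and transpose the 16x4 matrix into channel
+    // registers: first collect pixels {0,4,8,c}, {1,5,9,d}, {2,6,a,e}, and
+    // {3,7,b,f}, one pixel per 128-bit lane, then two rounds of unpacks
+    // finish the transpose within each lane.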
+    SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
+        F _048c, _159d, _26ae, _37bf;
+
+        _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr)    );
+        _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
+        _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
+        _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
+        _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+ 4) );
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
+        _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+ 8) );
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
+        _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
+
+        F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
+          ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
+          rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
+          ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);
+
+        *r = _mm512_unpacklo_ps(rg02468acf, rg13579bde);
+        *g = _mm512_unpackhi_ps(rg02468acf, rg13579bde);
+        *b = _mm512_unpacklo_ps(ba02468acf, ba13579bde);
+        *a = _mm512_unpackhi_ps(ba02468acf, ba13579bde);
+    }
+
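+    // The inverse of the transpose above: rebuild 16 consecutive RGBA pixels
+    // from the channel registers. The permutex_pd immediate 78 = 0b01'00'11'10
+    // selects [2,3,0,1], swapping the two pixels within each 256-bit lane,
+    // which is what the _1537 -> _5173 style names describe.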
+    SI void store4(float* ptr, F r, F g, F b, F a) {
+        F rg014589cd = _mm512_unpacklo_ps(r, g),
+          rg2367abef = _mm512_unpackhi_ps(r, g),
+          ba014589cd = _mm512_unpacklo_ps(b, a),
+          ba2367abef = _mm512_unpackhi_ps(b, a);
+
+        F _048c = _mm512_unpacklo_pd(rg014589cd, ba014589cd),  // r0 g0 b0 a0 | pixel 4 | 8 | c
+          _26ae = _mm512_unpacklo_pd(rg2367abef, ba2367abef),  // r2 g2 b2 a2 | pixel 6 | a | e
+          _159d = _mm512_unpackhi_pd(rg014589cd, ba014589cd),  // r1 g1 b1 a1 | pixel 5 | 9 | d
+          _37bf = _mm512_unpackhi_pd(rg2367abef, ba2367abef);  // r3 g3 b3 a3 | pixel 7 | b | f
+
+        F _ae26 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _26ae),
+          _bf37 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _37bf),
+          _8c04 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _048c),
+          _9d15 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _159d),
+
+          _0426 = _mm512_permutex2var_pd(_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _ae26),
+          _1537 = _mm512_permutex2var_pd(_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _bf37),
+          _5173 = _mm512_permutex_pd(_1537, 78),
+          _0123 = _mm512_permutex2var_pd(_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _5173),
+          _5476 = _mm512_permutex2var_pd(_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _0426),
+          _4567 = _mm512_permutex_pd(_5476, 78),
+          _8cae = _mm512_permutex2var_pd(_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _26ae),
+          _9dbf = _mm512_permutex2var_pd(_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _37bf),
+          _d9fb = _mm512_permutex_pd(_9dbf, 78),
+          _89ab = _mm512_permutex2var_pd(_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _d9fb),
+          _dcfe = _mm512_permutex2var_pd(_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _8cae),
+          _cdef = _mm512_permutex_pd(_dcfe, 78);
+
+        _mm512_storeu_ps(ptr+ 0, _0123);
+        _mm512_storeu_ps(ptr+16, _4567);
+        _mm512_storeu_ps(ptr+32, _89ab);
+        _mm512_storeu_ps(ptr+48, _cdef);
+    }
 
 #elif defined(JUMPER_IS_HSW)
     // These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -631,6 +850,12 @@ template <typename T> using V = T __attribute__((ext_vector_type(4)));
     SI U32 trunc_(F v)   { return (U32)v; }
     SI U32 expand(U16 v) { return (U32)v; }
     SI U32 expand(U8 v)  { return (U32)v; }
+#elif defined(JUMPER_IS_AVX512)
+    SI F   cast  (U32 v) { return _mm512_cvtepu32_ps(v); }
+    SI F   cast64(U64 v) { return __builtin_convertvector(v, F); }
+    SI U32 trunc_(F v)   { return (U32)__builtin_convertvector(v, I32); }
+    SI U32 expand(U16 v) { return _mm512_cvtepu16_epi32(v); }
+    SI U32 expand(U8 v)  { return _mm512_cvtepu8_epi32(v); }
 #else
     SI F cast  (U32 v) { return __builtin_convertvector((I32)v, F); }
     SI F cast64(U64 v) { return __builtin_convertvector(v, F); }
@@ -692,6 +917,9 @@ SI F from_half(U16 h) {
     && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
     return vcvt_f32_f16(h);
 
+#elif defined(JUMPER_IS_AVX512)
+    return _mm512_cvtph_ps(h);
+
 #elif defined(JUMPER_IS_HSW)
     return _mm256_cvtph_ps(h);
 
@@ -713,6 +941,9 @@ SI U16 to_half(F f) {
     && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
     return vcvt_f16_f32(f);
 
+#elif defined(JUMPER_IS_AVX512)
+    return _mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+
 #elif defined(JUMPER_IS_HSW)
     return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
 
@@ -4162,7 +4393,7 @@ namespace lowp {
 
 #else  // We are compiling vector code with Clang... let's make some lowp stages!
 
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512) || defined(JUMPER_IS_HSW)
     using U8  = uint8_t  __attribute__((ext_vector_type(16)));
     using U16 = uint16_t __attribute__((ext_vector_type(16)));
     using I16 = int16_t  __attribute__((ext_vector_type(16)));
@@ -4440,7 +4671,10 @@ SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
 
 // Use approximate instructions and one Newton-Raphson step to calculate 1/x.
 SI F rcp_precise(F x) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    F e = _mm512_rcp14_ps(x);
+    return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
+#elif defined(JUMPER_IS_HSW)
     __m256 lo,hi;
     split(x, &lo,&hi);
     return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
@@ -4457,7 +4691,9 @@ SI F rcp_precise(F x) {
 #endif
 }
 SI F sqrt_(F x) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    return _mm512_sqrt_ps(x);
+#elif defined(JUMPER_IS_HSW)
     __m256 lo,hi;
     split(x, &lo,&hi);
     return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
@@ -4492,6 +4728,8 @@ SI F floor_(F x) {
     float32x4_t lo,hi;
     split(x, &lo,&hi);
     return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
+#elif defined(JUMPER_IS_AVX512)
+    return _mm512_floor_ps(x);
 #elif defined(JUMPER_IS_HSW)
     __m256 lo,hi;
     split(x, &lo,&hi);
@@ -4512,7 +4750,9 @@ SI F floor_(F x) {
 // The result is a number on [-1, 1).
 // Note: on neon this is a saturating multiply while the others are not.
 SI I16 scaled_mult(I16 a, I16 b) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    return _mm256_mulhrs_epi16(a, b);
+#elif defined(JUMPER_IS_HSW)
     return _mm256_mulhrs_epi16(a, b);
 #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
     return _mm_mulhrs_epi16(a, b);
@@ -4786,7 +5026,25 @@ SI void store(T* ptr, V v) {
     memcpy(ptr, &v, sizeof(v));
 }
 
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
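+    // As in highp: a scalar-gather fallback, plus hardware-gather
+    // specializations for float and uint32_t.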
+    template <typename V, typename T>
+    SI V gather(const T* ptr, U32 ix) {
+        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
+                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
+                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
+    }
+
+    template <>
+    F gather(const float* ptr, U32 ix) {
+        return _mm512_i32gather_ps(ix, ptr, 4);
+    }
+
+    template <>
+    U32 gather(const uint32_t* ptr, U32 ix) {
+        return _mm512_i32gather_epi32(ix, ptr, 4);
+    }
+#elif defined(JUMPER_IS_HSW)
     template <typename V, typename T>
     SI V gather(const T* ptr, U32 ix) {
         return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
@@ -4824,7 +5082,12 @@ SI void store(T* ptr, V v) {
 // ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
 
 SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
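+    // Pre-swizzle the 64-bit lanes so the per-128-bit-lane _mm256_packus_epi32()
+    // in cast_U16() leaves all 16 pixels in order.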
+    rgba = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), rgba);
+    auto cast_U16 = [](U32 v) -> U16 {
+        return _mm256_packus_epi32(_mm512_castsi512_si256(v), _mm512_extracti64x4_epi64(v, 1));
+    };
+#elif defined(JUMPER_IS_HSW)
     // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
     __m256i _01,_23;
     split(rgba, &_01, &_23);