Skip to content

Commit c9c9ff7

Browse files
committed
Added AVX512 support for SkRasterPipeline_opts.h
This patch integrates AVX-512 support for certain functions in both the highp and lowp pipelines of SkRasterPipeline_opts.h. Testing and verification were conducted in the Pdfium repository, where the change passed pdfium_embeddertests.exe. Performance-wise, the AVX-512 code path shows a significant improvement over the standard SSE and AVX2 paths; this was confirmed by testing with PDF files sourced from the resources folder of the Pdfium library. This is an imported pull request from google#149. GitOrigin-RevId: 3dfeb3b Change-Id: I91f95a69d914ed57707239b7d2257a6c8f0c3ffa This is an imported pull request from google#151. GitOrigin-RevId: 02db57e Change-Id: Ia674977e3c1a083938bbfda1e9d785595896cb88
1 parent 5b3041f commit c9c9ff7

File tree

2 files changed

+270
-7
lines changed

2 files changed

+270
-7
lines changed

src/core/SkRasterPipelineOpContexts.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ namespace SkSL { class TraceHook; }
1919
// by stages that have no lowp implementation. They can therefore use the (smaller) highp value to
2020
// save memory in the arena.
2121
inline static constexpr int SkRasterPipeline_kMaxStride = 16;
22-
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 8;
22+
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 16;
2323

2424
// How much space to allocate for each MemoryCtx scratch buffer, as part of tail-pixel handling.
2525
inline static constexpr size_t SkRasterPipeline_MaxScratchPerPatch =

src/opts/SkRasterPipeline_opts.h

Lines changed: 269 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ using NoCtx = const void*;
5959
#define JUMPER_IS_SCALAR
6060
#elif defined(SK_ARM_HAS_NEON)
6161
#define JUMPER_IS_NEON
62+
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
63+
#define JUMPER_IS_AVX512
6264
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
6365
#define JUMPER_IS_HSW
6466
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
@@ -291,6 +293,223 @@ namespace SK_OPTS_NS {
291293
SI void store4(float* ptr, F r, F g, F b, F a) {
292294
vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
293295
}
296+
#elif defined(JUMPER_IS_AVX512)
// 16-wide SIMD via AVX-512: every highp register type holds 16 lanes, so the
// pipeline processes 16 pixels per pass.
template <typename T> using V = T __attribute__((ext_vector_type(16)));
using F   = V<float   >;
using I32 = V< int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t >;

// Fused multiply-add: f*m + a in a single instruction (one rounding).
SI F   mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }

SI F   min(F a, F b)     { return _mm512_min_ps(a,b);    }
SI I32 min(I32 a, I32 b) { return _mm512_min_epi32(a,b); }
SI U32 min(U32 a, U32 b) { return _mm512_min_epu32(a,b); }
SI F   max(F a, F b)     { return _mm512_max_ps(a,b);    }
SI I32 max(I32 a, I32 b) { return _mm512_max_epi32(a,b); }
SI U32 max(U32 a, U32 b) { return _mm512_max_epu32(a,b); }

// |v|: v and (0 - v) differ only in the sign bit, so AND-ing them clears it.
SI F   abs_ (F v)   { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
SI I32 abs_ (I32 v) { return _mm512_abs_epi32(v); }
SI F   floor_(F v)  { return _mm512_floor_ps(v); }
SI F   ceil_(F v)   { return _mm512_ceil_ps(v);  }

// Hardware approximations (14-bit precision per the rcp14/rsqrt14 intrinsics).
SI F rcp_approx  (F v) { return _mm512_rcp14_ps  (v); }
SI F rsqrt_approx(F v) { return _mm512_rsqrt14_ps(v); }
SI F sqrt_       (F v) { return _mm512_sqrt_ps   (v); }

// One Newton-Raphson refinement of the approximate reciprocal: e * (2 - v*e).
SI F rcp_precise (F v) {
    F e = rcp_approx(v);
    return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
}

// Float -> int32 conversion using the current MXCSR rounding mode.
SI U32 round(F v)          { return _mm512_cvtps_epi32(v); }
SI U32 round(F v, F scale) { return _mm512_cvtps_epi32(v*scale); }
325+
// Narrow 16 x 32-bit to 16 x 16-bit with unsigned saturation.
// _mm256_packus_epi32 packs within 128-bit lanes, leaving 64-bit chunks in the
// order 0123 89ab 4567 cdef; the permute with 216 (== _MM_SHUFFLE(3,1,2,0))
// restores element order.
SI U16 pack(U32 v) {
    __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256(v),
                                      _mm512_extracti64x4_epi64(v, 1));
    return _mm256_permutex_epi64(rst, 216);
}

// Narrow 16 x 16-bit to 16 x 8-bit with unsigned saturation.  The in-lane pack
// duplicates each half; the permute with 8 (== _MM_SHUFFLE(0,0,2,0)) moves the
// two meaningful 64-bit quarters into the low 128 bits.
SI U8 pack(U16 v) {
    __m256i rst = _mm256_packus_epi16(v, v);
    return _mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
}
334+
// Bitwise select: each result bit is taken from t where the corresponding bit
// of c is set and from e where it is clear.  202 (0xCA) is the ternary-logic
// truth table for (c & t) | (~c & e).
SI F if_then_else(I32 c, F t, F e) {
    return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t),
                                                            _mm512_castps_si512(e), 202));
}

// A lane counts as "true" if any of its 32 bits are set; test_epi32_mask
// produces one mask bit per lane.
SI bool any(I32 c) { return _mm512_test_epi32_mask(c, c) != 0; }
SI bool all(I32 c) { return _mm512_test_epi32_mask(c, c) == 0xffff; }
346+
template <typename T>
347+
SI V<T> gather(const T* p, U32 ix) {
348+
return{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
349+
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]],
350+
p[ix[8]], p[ix[9]], p[ix[10]], p[ix[11]],
351+
p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]],
352+
};
353+
}
354+
SI F gather(const float* p, U32 ix) { return _mm512_i32gather_ps(ix, p, 4); }
355+
SI U32 gather(const uint32_t* p, U32 ix) { return _mm512_i32gather_epi32(ix, p, 4); }
356+
SI U64 gather(const uint64_t* p, U32 ix) {
357+
__m512i parts[] = {
358+
_mm512_i32gather_epi64(_mm512_castsi512_si256(ix), p, 8),
359+
_mm512_i32gather_epi64(_mm512_extracti32x8_epi32(ix, 1), p, 8),
360+
};
361+
return sk_bit_cast<U64>(parts);
362+
}
363+
template <typename V, typename S>
364+
SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
365+
V before = gather(dst, ix);
366+
V after = if_then_else(mask, src, before);
367+
dst[ix[0]] = after[0];
368+
dst[ix[1]] = after[1];
369+
dst[ix[2]] = after[2];
370+
dst[ix[3]] = after[3];
371+
dst[ix[4]] = after[4];
372+
dst[ix[5]] = after[5];
373+
dst[ix[6]] = after[6];
374+
dst[ix[7]] = after[7];
375+
dst[ix[8]] = after[8];
376+
dst[ix[9]] = after[9];
377+
dst[ix[10]] = after[10];
378+
dst[ix[11]] = after[11];
379+
dst[ix[12]] = after[12];
380+
dst[ix[13]] = after[13];
381+
dst[ix[14]] = after[14];
382+
dst[ix[15]] = after[15];
383+
}
384+
385+
// Load 16 interleaved rg pixel pairs (32 uint16s) and deinterleave into r, g.
// Shifting left then arithmetic-right by 16 isolates one 16-bit half of each
// 32-bit rg pair with sign extension, so the signed pack cannot saturate.
// The final permute with 216 (== _MM_SHUFFLE(3,1,2,0)) undoes the in-lane
// ordering of _mm256_packs_epi32.
SI void load2(const uint16_t* ptr, U16* r, U16* g) {
    U16 _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
    U16 _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);

    *r = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
         (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
    *g = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
                                                     _mm256_srai_epi32(_89abcdef, 16)), 216);
}

// Interleave r and g back into rg pairs and store 16 pixels (32 uint16s).
// unpacklo/unpackhi interleave within 128-bit lanes, so the 64-bit blocks come
// out as {01}{23}{89}{ab} / {45}{67}{cd}{ef}; the permutexvar index
// (0,1,4,5,2,3,6,7) reorders them into memory order before the two stores.
SI void store2(uint16_t* ptr, U16 r, U16 g) {
    auto _01234567 = _mm256_unpacklo_epi16(r, g);
    auto _89abcdef = _mm256_unpackhi_epi16(r, g);
    __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
                                                _89abcdef, 1);
    __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
    _01234567 = _mm512_castsi512_si256(aa);
    _89abcdef = _mm512_extracti64x4_epi64(aa, 1);

    _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
    _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
}
406+
407+
// Load 16 interleaved rgba pixels with 16-bit channels (64 uint16s — two
// 512-bit loads) and deinterleave into planar r, g, b, a.
SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
    __m512i _01234567 = _mm512_loadu_si512((__m512i*)ptr);
    __m512i _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));

    // Each 64-bit lane holds one pixel: r in bits 0-15, g in 16-31, b in 32-47,
    // a in 48-63.  For each channel, shift it down to the bottom of the lane,
    // mask to a full 16-bit channel, then narrow each 64-bit lane to 16 bits
    // and concatenate the two halves.
    //
    // BUG FIX: the mask must be 0xFFFF.  The previous 0xFF zeroed bits 8-15 of
    // every channel before the 64->16-bit narrowing, silently dropping the high
    // byte of each 16-bit channel value.
    __m512i mask16 = _mm512_set1_epi64(0xFFFF);

    *r = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_01234567, mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_89abcdef, mask16)));
    *g = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 16), mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 16), mask16)));
    *b = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 32), mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 32), mask16)));
    *a = _mm256_setr_m128i(
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 48), mask16)),
            _mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 48), mask16)));
}
424+
// Interleave planar r, g, b, a (16-bit channels) back into 16 rgba pixels and
// store them as four 256-bit writes.  Variable names encode which pixels each
// register holds; unpacks interleave within 128-bit lanes, which is why pixels
// land in the 012389ab / 4567cdef groupings below.
SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
    auto rg012389ab = _mm256_unpacklo_epi16(r, g),
         rg4567cdef = _mm256_unpackhi_epi16(r, g),
         ba012389ab = _mm256_unpacklo_epi16(b, a),
         ba4567cdef = _mm256_unpackhi_epi16(b, a);

    auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
         _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
         _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
         _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);

    // 78 == _MM_SHUFFLE(1,0,3,2): swaps the two 128-bit halves.  The blends
    // (0xf0 keeps the low half of the first operand and the high half of the
    // second; 0x0f the reverse) stitch the pixels into consecutive groups.
    auto _ab23 = _mm256_permutex_epi64(_23ab, 78);
    auto _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0);
    auto _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78);
    auto _ef67 = _mm256_permutex_epi64(_67ef, 78);
    auto _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0);
    auto _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);

    _mm256_storeu_si256((__m256i*)ptr, _0123);
    _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
    _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
    _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
}
447+
448+
// Load 16 interleaved rgba float pixels (64 floats) and transpose to planar
// r, g, b, a.  Each accumulator gathers one 128-bit pixel from each quarter of
// the buffer, so e.g. _048c holds pixels 0, 4, 8 and c — a 4x4 block transpose
// done with inserts, finished off by the unpack network below.
SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
    F _048c, _159d, _26ae, _37bf;

    _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr)         );
    _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
    _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
    _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
    _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4)       );
    _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
    _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
    _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
    _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8)       );
    _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
    _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
    _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
    _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12)      );
    _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
    _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
    _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);

    // In-lane unpacks pair up channels from pixels two apart...
    F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
      ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
      rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
      ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);

    // ...and a second round completes the transpose into planar channels.
    *r = _mm512_unpacklo_ps(rg02468acf, rg13579bde);
    *g = _mm512_unpackhi_ps(rg02468acf, rg13579bde);
    *b = _mm512_unpacklo_ps(ba02468acf, ba13579bde);
    *a = _mm512_unpackhi_ps(ba02468acf, ba13579bde);
}
478+
479+
// Transpose planar r, g, b, a back into 16 interleaved rgba float pixels and
// store them with four contiguous 512-bit writes.  Variable names encode which
// pixels a register holds after each shuffle stage.
SI void store4(float* ptr, F r, F g, F b, F a) {
    F rg014589cd = _mm512_unpacklo_ps(r, g),
      rg2367abef = _mm512_unpackhi_ps(r, g),
      ba014589cd = _mm512_unpacklo_ps(b, a),
      ba2367abef = _mm512_unpackhi_ps(b, a);

    F _048c = _mm512_unpacklo_pd(rg014589cd, ba014589cd),  // r0 g0 b0 a0 4 8 c
      _26ae = _mm512_unpacklo_pd(rg2367abef, ba2367abef),  // r2 g2 b2 a2 6 a e
      _159d = _mm512_unpackhi_pd(rg014589cd, ba014589cd),  // r1 g1 b1 a1 5 9 d
      _37bf = _mm512_unpackhi_pd(rg2367abef, ba2367abef);  // r3 g3 b3 a3 7 b f

    // The permutexvar index (4,5,6,7,0,1,2,3) swaps the 256-bit halves; the
    // permutex2var steps then merge 256-bit halves from two sources, and the
    // in-lane permutes (imm 176) fix up the final 64-bit pixel order.
    F _ae26 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _26ae),
      _bf37 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _37bf),
      _8c04 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _048c),
      _9d15 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _159d),

      _0426 = _mm512_permutex2var_pd(_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _ae26),
      _1537 = _mm512_permutex2var_pd(_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _bf37),
      _5173 = _mm512_permutex_pd(_1537, 176),
      _0123 = _mm512_permutex2var_pd(_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _5173),
      _5476 = _mm512_permutex2var_pd(_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _0426),
      _4567 = _mm512_permutex_pd(_5476, 176),
      _8cae = _mm512_permutex2var_pd(_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _26ae),
      _9dbf = _mm512_permutex2var_pd(_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _37bf),
      _d9fb = _mm512_permutex_pd(_9dbf, 176),
      _89ab = _mm512_permutex2var_pd(_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _d9fb),
      _dcfe = _mm512_permutex2var_pd(_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _8cae),
      _cdef = _mm512_permutex_pd(_dcfe, 176);

    _mm512_storeu_ps(ptr+0,  _0123);
    _mm512_storeu_ps(ptr+16, _4567);
    _mm512_storeu_ps(ptr+32, _89ab);
    _mm512_storeu_ps(ptr+48, _cdef);
}
294513

295514
#elif defined(JUMPER_IS_HSW)
296515
// These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -631,6 +850,12 @@ template <typename T> using V = T __attribute__((ext_vector_type(4)));
631850
SI U32 trunc_(F v) { return (U32)v; }
632851
SI U32 expand(U16 v) { return (U32)v; }
633852
SI U32 expand(U8 v) { return (U32)v; }
853+
#elif defined (JUMPER_IS_AVX512)
    // Lane-wise conversions for the 16-wide AVX-512 types.
    SI F cast  (U32 v) { return _mm512_cvtepu32_ps(v); }            // unsigned int -> float
    SI F cast64(U64 v) { return __builtin_convertvector( v, F); }   // uint64 -> float, per lane
    SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }  // truncate toward zero
    SI U32 expand(U16 v) { return _mm512_cvtepu16_epi32(v); }       // zero-extend 16 -> 32 bit
    SI U32 expand(U8  v) { return _mm512_cvtepu8_epi32(v); }        // zero-extend  8 -> 32 bit
634859
#else
635860
SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
636861
SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
@@ -692,6 +917,9 @@ SI F from_half(U16 h) {
692917
&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
693918
return vcvt_f32_f16(h);
694919

920+
#elif defined(JUMPER_IS_AVX512)
921+
return _mm512_cvtph_ps(h);
922+
695923
#elif defined(JUMPER_IS_HSW)
696924
return _mm256_cvtph_ps(h);
697925

@@ -713,6 +941,9 @@ SI U16 to_half(F f) {
713941
&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
714942
return vcvt_f16_f32(f);
715943

944+
#elif defined(JUMPER_IS_AVX512)
945+
return _mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
946+
716947
#elif defined(JUMPER_IS_HSW)
717948
return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
718949

@@ -4162,7 +4393,7 @@ namespace lowp {
41624393

41634394
#else // We are compiling vector code with Clang... let's make some lowp stages!
41644395

4165-
#if defined(JUMPER_IS_HSW)
4396+
#if defined(JUMPER_IS_AVX512) || defined(JUMPER_IS_HSW)
41664397
using U8 = uint8_t __attribute__((ext_vector_type(16)));
41674398
using U16 = uint16_t __attribute__((ext_vector_type(16)));
41684399
using I16 = int16_t __attribute__((ext_vector_type(16)));
@@ -4440,7 +4671,10 @@ SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
44404671

44414672
// Use approximate instructions and one Newton-Raphson step to calculate 1/x.
44424673
SI F rcp_precise(F x) {
4443-
#if defined(JUMPER_IS_HSW)
4674+
#if defined(JUMPER_IS_AVX512)
4675+
F e = _mm512_rcp14_ps(x);
4676+
return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
4677+
#elif defined(JUMPER_IS_HSW)
44444678
__m256 lo,hi;
44454679
split(x, &lo,&hi);
44464680
return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
@@ -4457,7 +4691,9 @@ SI F rcp_precise(F x) {
44574691
#endif
44584692
}
44594693
SI F sqrt_(F x) {
4460-
#if defined(JUMPER_IS_HSW)
4694+
#if defined(JUMPER_IS_AVX512)
4695+
return _mm512_sqrt_ps(x);
4696+
#elif defined(JUMPER_IS_HSW)
44614697
__m256 lo,hi;
44624698
split(x, &lo,&hi);
44634699
return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
@@ -4492,6 +4728,8 @@ SI F floor_(F x) {
44924728
float32x4_t lo,hi;
44934729
split(x, &lo,&hi);
44944730
return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
4731+
#elif defined(JUMPER_IS_AVX512)
4732+
return _mm512_floor_ps(x);
44954733
#elif defined(JUMPER_IS_HSW)
44964734
__m256 lo,hi;
44974735
split(x, &lo,&hi);
@@ -4512,7 +4750,9 @@ SI F floor_(F x) {
45124750
// The result is a number on [-1, 1).
45134751
// Note: on neon this is a saturating multiply while the others are not.
45144752
SI I16 scaled_mult(I16 a, I16 b) {
4515-
#if defined(JUMPER_IS_HSW)
4753+
#if defined(JUMPER_IS_AVX512)
4754+
return _mm256_mulhrs_epi16(a, b);
4755+
#elif defined(JUMPER_IS_HSW)
45164756
return _mm256_mulhrs_epi16(a, b);
45174757
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
45184758
return _mm_mulhrs_epi16(a, b);
@@ -4786,7 +5026,25 @@ SI void store(T* ptr, V v) {
47865026
memcpy(ptr, &v, sizeof(v));
47875027
}
47885028

4789-
#if defined(JUMPER_IS_HSW)
5029+
#if defined(JUMPER_IS_AVX512)
    // Generic 16-lane gather for the lowp pipeline: one scalar load per lane.
    template <typename V, typename T>
    SI V gather(const T* ptr, U32 ix) {
        V out;
        for (int lane = 0; lane < 16; lane++) {
            out[lane] = ptr[ix[lane]];
        }
        return out;
    }

    // Hardware-accelerated specializations for 32-bit elements.
    template<>
    F gather(const float* ptr, U32 ix) {
        return _mm512_i32gather_ps(ix, ptr, 4);
    }

    template<>
    U32 gather(const uint32_t* ptr, U32 ix) {
        return _mm512_i32gather_epi32(ix, ptr, 4);
    }
5047+
#elif defined(JUMPER_IS_HSW)
47905048
template <typename V, typename T>
47915049
SI V gather(const T* ptr, U32 ix) {
47925050
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
@@ -4824,7 +5082,12 @@ SI void store(T* ptr, V v) {
48245082
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
48255083

48265084
SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
4827-
#if defined(JUMPER_IS_HSW)
5085+
#if defined(JUMPER_IS_AVX512)
5086+
rgba = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), rgba);
5087+
auto cast_U16 = [](U32 v) -> U16 {
5088+
return _mm256_packus_epi32(_mm512_castsi512_si256(v), _mm512_extracti64x4_epi64(v, 1));
5089+
};
5090+
#elif defined(JUMPER_IS_HSW)
48285091
// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
48295092
__m256i _01,_23;
48305093
split(rgba, &_01, &_23);

0 commit comments

Comments
 (0)