@@ -65,7 +65,7 @@ ESIMD_INLINE void cmk_write(ty *buf, uint32_t offset, simd<ty, size> v) {
6565// Function bitonic_exchange{1,2,4,8} compares and swaps elements with
6666// the particular strides
6767ESIMD_INLINE simd<uint32_t , BASE_SZ>
68- bitonic_exchange8 (simd<uint32_t , BASE_SZ> A, simd<ushort, 32 > flip) {
68+ bitonic_exchange8 (simd<uint32_t , BASE_SZ> A, simd_mask< 32 > flip) {
6969 simd<uint32_t , BASE_SZ> B;
7070#pragma unroll
7171 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
@@ -80,7 +80,7 @@ bitonic_exchange8(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
8080}
8181
8282ESIMD_INLINE simd<uint32_t , BASE_SZ>
83- bitonic_exchange4 (simd<uint32_t , BASE_SZ> A, simd<ushort, 32 > flip) {
83+ bitonic_exchange4 (simd<uint32_t , BASE_SZ> A, simd_mask< 32 > flip) {
8484 simd<uint32_t , BASE_SZ> B;
8585#pragma unroll
8686 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
@@ -109,7 +109,7 @@ bitonic_exchange4(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
109109// each mov copies four 64-bit data, which is 4X SIMD efficiency
110110// improvement over the straightforward implementation.
111111ESIMD_INLINE simd<uint32_t , BASE_SZ>
112- bitonic_exchange2 (simd<uint32_t , BASE_SZ> A, simd<ushort, 32 > flip) {
112+ bitonic_exchange2 (simd<uint32_t , BASE_SZ> A, simd_mask< 32 > flip) {
113113 simd<uint32_t , BASE_SZ> B;
114114#pragma unroll
115115 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
@@ -124,7 +124,7 @@ bitonic_exchange2(simd<uint32_t, BASE_SZ> A, simd<ushort, 32> flip) {
124124}
125125
126126ESIMD_INLINE simd<uint32_t , BASE_SZ>
127- bitonic_exchange1 (simd<uint32_t , BASE_SZ> A, simd<ushort, 32 > flip) {
127+ bitonic_exchange1 (simd<uint32_t , BASE_SZ> A, simd_mask< 32 > flip) {
128128 simd<uint32_t , BASE_SZ> B;
129129#pragma unroll
130130 // each thread is handling 256-element chunk. Each iteration
@@ -219,8 +219,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
219219 // similar to bitonic_exchange{1,2,4,8}.
220220
221221 // exchange 8
222- simd<ushort, 32 > flip13 = esimd_unpack_mask<32 >(0xff00ff00 ); // (init_mask13);
223- simd<ushort, 32 > flip14 = esimd_unpack_mask<32 >(0x00ff00ff ); // (init_mask14);
222+ simd_mask< 32 > flip13 = esimd_unpack_mask<32 >(0xff00ff00 ); // (init_mask13);
223+ simd_mask< 32 > flip14 = esimd_unpack_mask<32 >(0x00ff00ff ); // (init_mask14);
224224 simd<uint32_t , BASE_SZ> B;
225225 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
226226 B.select <8 , 1 >(i) = A.select <8 , 1 >(i + 8 );
@@ -239,8 +239,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
239239 }
240240
241241 // exchange 4
242- simd<ushort, 32 > flip15 = esimd_unpack_mask<32 >(0xf0f0f0f0 ); // (init_mask15);
243- simd<ushort, 32 > flip16 = esimd_unpack_mask<32 >(0x0f0f0f0f ); // (init_mask16);
242+ simd_mask< 32 > flip15 = esimd_unpack_mask<32 >(0xf0f0f0f0 ); // (init_mask15);
243+ simd_mask< 32 > flip16 = esimd_unpack_mask<32 >(0x0f0f0f0f ); // (init_mask16);
244244#pragma unroll
245245 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
246246 auto MA = A.select <32 , 1 >(i).bit_cast_view <uint32_t , 4 , 8 >();
@@ -259,8 +259,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
259259 }
260260
261261 // exchange 2
262- simd<ushort, 32 > flip17 = esimd_unpack_mask<32 >(0xcccccccc ); // (init_mask17);
263- simd<ushort, 32 > flip18 = esimd_unpack_mask<32 >(0x33333333 ); // (init_mask18);
262+ simd_mask< 32 > flip17 = esimd_unpack_mask<32 >(0xcccccccc ); // (init_mask17);
263+ simd_mask< 32 > flip18 = esimd_unpack_mask<32 >(0x33333333 ); // (init_mask18);
264264#pragma unroll
265265 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
266266 auto MB = B.select <32 , 1 >(i).bit_cast_view <long long , 4 , 4 >();
@@ -279,8 +279,8 @@ ESIMD_INLINE void bitonic_merge(uint32_t offset, simd<uint32_t, BASE_SZ> &A,
279279 flip18);
280280 }
281281 // exchange 1
282- simd<ushort, 32 > flip19 = esimd_unpack_mask<32 >(0xaaaaaaaa ); // (init_mask19);
283- simd<ushort, 32 > flip20 = esimd_unpack_mask<32 >(0x55555555 ); // (init_mask20);
282+ simd_mask< 32 > flip19 = esimd_unpack_mask<32 >(0xaaaaaaaa ); // (init_mask19);
283+ simd_mask< 32 > flip20 = esimd_unpack_mask<32 >(0x55555555 ); // (init_mask20);
284284#pragma unroll
285285 // Each iteration compares and swaps 2 32-element chunks
286286 for (int i = 0 ; i < BASE_SZ; i += 32 ) {
@@ -323,28 +323,28 @@ ESIMD_INLINE void cmk_bitonic_sort_256(uint32_t *buf1, uint32_t *buf2,
323323 simd<uint32_t , BASE_SZ> B;
324324 A = cmk_read<uint32_t , BASE_SZ>(buf1, offset);
325325
326- simd<ushort, 32 > flip1 = esimd_unpack_mask<32 >(0x66666666 ); // (init_mask1);
326+ simd_mask< 32 > flip1 = esimd_unpack_mask<32 >(0x66666666 ); // (init_mask1);
327327
328328 simd<unsigned short , 32 > mask;
329329 // stage 0
330330 B = bitonic_exchange1 (A, flip1);
331331 // stage 1
332- simd<ushort, 32 > flip2 = esimd_unpack_mask<32 >(0x3c3c3c3c ); // (init_mask2);
333- simd<ushort, 32 > flip3 = esimd_unpack_mask<32 >(0x5a5a5a5a ); // (init_mask3);
332+ simd_mask< 32 > flip2 = esimd_unpack_mask<32 >(0x3c3c3c3c ); // (init_mask2);
333+ simd_mask< 32 > flip3 = esimd_unpack_mask<32 >(0x5a5a5a5a ); // (init_mask3);
334334 A = bitonic_exchange2 (B, flip2);
335335 B = bitonic_exchange1 (A, flip3);
336336 // stage 2
337- simd<ushort, 32 > flip4 = esimd_unpack_mask<32 >(0x0ff00ff0 ); // (init_mask4);
338- simd<ushort, 32 > flip5 = esimd_unpack_mask<32 >(0x33cc33cc ); // (init_mask5);
339- simd<ushort, 32 > flip6 = esimd_unpack_mask<32 >(0x55aa55aa ); // (init_mask6);
337+ simd_mask< 32 > flip4 = esimd_unpack_mask<32 >(0x0ff00ff0 ); // (init_mask4);
338+ simd_mask< 32 > flip5 = esimd_unpack_mask<32 >(0x33cc33cc ); // (init_mask5);
339+ simd_mask< 32 > flip6 = esimd_unpack_mask<32 >(0x55aa55aa ); // (init_mask6);
340340 A = bitonic_exchange4 (B, flip4);
341341 B = bitonic_exchange2 (A, flip5);
342342 A = bitonic_exchange1 (B, flip6);
343343 // stage 3
344- simd<ushort, 32 > flip7 = esimd_unpack_mask<32 >(0x00ffff00 ); // (init_mask7);
345- simd<ushort, 32 > flip8 = esimd_unpack_mask<32 >(0x0f0ff0f0 ); // (init_mask8);
346- simd<ushort, 32 > flip9 = esimd_unpack_mask<32 >(0x3333cccc ); // (init_mask9);
347- simd<ushort, 32 > flip10 = esimd_unpack_mask<32 >(0x5555aaaa ); // (init_mask10);
344+ simd_mask< 32 > flip7 = esimd_unpack_mask<32 >(0x00ffff00 ); // (init_mask7);
345+ simd_mask< 32 > flip8 = esimd_unpack_mask<32 >(0x0f0ff0f0 ); // (init_mask8);
346+ simd_mask< 32 > flip9 = esimd_unpack_mask<32 >(0x3333cccc ); // (init_mask9);
347+ simd_mask< 32 > flip10 = esimd_unpack_mask<32 >(0x5555aaaa ); // (init_mask10);
348348 B = bitonic_exchange8 (A, flip7);
349349 A = bitonic_exchange4 (B, flip8);
350350 B = bitonic_exchange2 (A, flip9);
0 commit comments