SIMD uint8x16 to 4 of uint32x4 with transpose? #348

ghost · 2016-08-12T19:05:07Z

Hello, I have a question. How can I unpack a uint8x16 into four uint32x4 vectors with a transpose, i.e. go from the layout rgba rgba rgba rgba to rrrr gggg bbbb aaaa? And how can I go backward, from four uint32x4 to a single uint8x16, again with a transpose?

PeterJensen · 2016-08-15T16:42:32Z

So you want one uint8x16 turned into four uint32x4 like this:

input (uint8x16): rgba,rgba,rgba,rgba (r, g, b, and a are each 8 bits)

output 0 (uint32x4): r,r,r,r (each r is 32 bits)
output 1 (uint32x4): g,g,g,g (each g is 32 bits)
output 2 (uint32x4): b,b,b,b (each b is 32 bits)
output 3 (uint32x4): a,a,a,a (each a is 32 bits)

and vice-versa.

Is my understanding correct?

ghost · 2016-08-15T18:31:01Z

correct

PeterJensen · 2016-08-15T21:05:29Z

Maybe something like this will do the trick:

/**
 * De-interleaves one Uint8x16 into four Uint32x4 vectors and stores them in dst.
 *
 * Byte lanes 0/4/8/12 of src end up in dst[0..3], lanes 1/5/9/13 in dst[4..7],
 * lanes 2/6/10/14 in dst[8..11] and lanes 3/7/11/15 in dst[12..15].
 *
 * @param {SIMD.Uint8x16} src - sixteen interleaved bytes (e.g. rgba,rgba,rgba,rgba).
 * @param {Uint32Array} dst - destination for the four stores at indices 0, 4, 8
 *   and 12; expected to be a Uint32Array with at least 16 elements.
 */
function to4xUint32x4(src, dst) {
  const U8 = SIMD.Uint8x16;
  const U32 = SIMD.Uint32x4;
  const zeros = U8.splat(0);

  // Shuffle index 16 selects lane 0 of `zeros`, so every picked source byte is
  // followed by three zero bytes. Reinterpreted as 32-bit lanes this yields the
  // byte's value (assumes little-endian lane layout; confirm for the target).
  const reds   = U8.shuffle(src, zeros, 0, 16, 16, 16, 4, 16, 16, 16, 8, 16, 16, 16, 12, 16, 16, 16);
  const greens = U8.shuffle(src, zeros, 1, 16, 16, 16, 5, 16, 16, 16, 9, 16, 16, 16, 13, 16, 16, 16);
  const blues  = U8.shuffle(src, zeros, 2, 16, 16, 16, 6, 16, 16, 16, 10, 16, 16, 16, 14, 16, 16, 16);
  const alphas = U8.shuffle(src, zeros, 3, 16, 16, 16, 7, 16, 16, 16, 11, 16, 16, 16, 15, 16, 16, 16);

  // One Uint32x4 per channel, stored back to back.
  U32.store(dst, 0, U32.fromUint8x16Bits(reds));
  U32.store(dst, 4, U32.fromUint8x16Bits(greens));
  U32.store(dst, 8, U32.fromUint8x16Bits(blues));
  U32.store(dst, 12, U32.fromUint8x16Bits(alphas));
}

// Smoke test for to4xUint32x4: feed byte lanes 0..15 and print the 16 stored values.
var input  = SIMD.Uint8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
var output = new Uint32Array(4 * 4); // four Uint32x4 vectors, stored back to back

to4xUint32x4(input, output);
console.log(output);

Note: I assume that the 'r's are in input lane 0, 4, 8, and 12. 'g's are in input lane 1, 5, 9, 13. Etc.

juj · 2016-08-16T19:48:12Z

How about the following?

// De-interleaves rgba bytes into four Uint32x4 vectors with shifts and masks
// instead of byte shuffles.
var mask = SIMD.Uint32x4.splat(0xFF); // Constant: create outside the hot loop.

var input = SIMD.Uint8x16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);

input = SIMD.Uint32x4.fromUint8x16Bits(input);  // No-op
var r = SIMD.Uint32x4.and(input, mask); // PAND: 1 cycle in port 0,1 or 5 on Intel Haswell and newer
var g = SIMD.Uint32x4.and(SIMD.Uint32x4.shiftRightByScalar(input, 8), mask); // PSRLD: 1 cycle in port 0, and PAND for 1 cycle in port 0, 1 or 5, on Intel Haswell and newer.
var b = SIMD.Uint32x4.and(SIMD.Uint32x4.shiftRightByScalar(input, 16), mask);
// Uint32x4 lanes are unsigned, so the right shift by 24 fills with zeroes and
// leaves only the top byte in each lane; masking with 0xFF would be redundant.
var a = SIMD.Uint32x4.shiftRightByScalar(input, 24);

console.log(r.toString()); // Prints SIMD.Uint32x4(0, 4, 8, 12)
console.log(g.toString()); // Prints SIMD.Uint32x4(1, 5, 9, 13)
console.log(b.toString()); // Prints SIMD.Uint32x4(2, 6, 10, 14)
console.log(a.toString()); // Prints SIMD.Uint32x4(3, 7, 11, 15)

In native SSE code, that would be expected to run in four cycles (of throughput cost) per iteration, since the `and` of the previous color component and the `shiftRightByScalar` of the next color component can be scheduled in parallel on different ports on Haswell and newer architectures.

ghost · 2016-08-19T14:08:42Z

Why don't you create a Gitter room? Also, how do I convert back to a uint8x16, restoring the original layout?

PeterJensen · 2016-08-19T16:53:55Z

The reverse operation could be done like this:

/**
 * Re-interleaves four Uint32x4 vectors (loaded from src at indices 0, 4, 8 and
 * 12) into a single Uint8x16 using byte swizzles.
 *
 * Assumes every 32-bit input value fits in 8 bits: byte lane 1 of each loaded
 * vector is used as the "zero" filler, and the results are combined with `or`,
 * so larger values would corrupt neighbouring bytes.
 *
 * @param {Uint32Array} src - sixteen values laid out as rrrr gggg bbbb aaaa
 *   (presumably a Uint32Array; verify against the caller).
 * @returns {SIMD.Uint8x16} the bytes interleaved as rgba,rgba,rgba,rgba.
 */
function toUint8x16(src) {
  const U8 = SIMD.Uint8x16;
  const loadAsBytes = (offset) => U8.fromUint32x4Bits(SIMD.Uint32x4.load(src, offset));

  const reds   = loadAsBytes(0); // already sit in byte lanes 0, 4, 8 and 12
  const greens = loadAsBytes(4);
  const blues  = loadAsBytes(8);
  const alphas = loadAsBytes(12);

  // Move byte lanes 0/4/8/12 of each channel to its slot inside every pixel;
  // all remaining output lanes copy input lane 1 (expected to be zero).
  const greensPlaced = U8.swizzle(greens, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12, 1, 1);
  const bluesPlaced  = U8.swizzle(blues, 1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12, 1);
  const alphasPlaced = U8.swizzle(alphas, 1, 1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12);

  return U8.or(U8.or(U8.or(reds, greensPlaced), bluesPlaced), alphasPlaced);
}
// Round-trip check: split the bytes with to4xUint32x4, then pack them again with toUint8x16.
var input  = SIMD.Uint8x16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
var output = new Uint32Array(4 * 4); // four Uint32x4 vectors, stored back to back

to4xUint32x4(input, output);
console.log(toUint8x16(output).toString()); // prints SIMD.Uint8x16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);

I'm not sure what you mean by "gitter".

ljharb · 2016-08-19T17:17:12Z

gitter.im - it's a chat room for repos. @acterhd sometimes maintainers don't want the increased cost of having to check yet another place for support.

juj · 2016-08-20T17:46:45Z

In @PeterJensen's reverse operation, the swizzles assume that the inputs are in uint8 range, and all those swizzles of lane 1 assume that lane 1 contains zero; otherwise the `or` operations will generate garbage, correct?

I doubt that the above code patterns with proposed swizzles or shuffles will have good performance, since they use the kind of swizzle and shuffle patterns that do not exist in native SSE or NEON as a fast operation. Assuming that the lane 1 has the value of 0, the code

// Output lanes 1, 5, 9 and 13 take input lanes 0, 4, 8 and 12; every other output lane copies input lane 1.
SIMD.Uint8x16.swizzle(src1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12,  1, 1)

is better written as

// Shift every 32-bit lane left by 8 bits instead of swizzling individual bytes.
SIMD.Uint8x16.fromUint32x4Bits(SIMD.Uint32x4.shiftLeftByScalar(SIMD.Uint32x4.fromUint8x16Bits(src1), 8))

which could map to the PSLLD instruction that is 1 throughput clock cycle of work.

PeterJensen · 2016-08-20T21:52:21Z

Thanks @juj much better!

The code can be simplified a bit more (fewer conversions), if the input values (srcx) are kept as Uint32x4 values. The complete function now looks like this:

/**
 * Re-interleaves four Uint32x4 vectors (loaded from src at indices 0, 4, 8 and
 * 12) into a single Uint8x16 using three lane shifts and three `or` operations.
 *
 * Assumes every 32-bit input value fits in 8 bits; larger values would spill
 * into neighbouring bytes once shifted and or-ed together.
 *
 * @param {Uint32Array} src - sixteen values laid out as rrrr gggg bbbb aaaa
 *   (presumably a Uint32Array; verify against the caller).
 * @returns {SIMD.Uint8x16} the bytes interleaved as rgba,rgba,rgba,rgba.
 */
function toUint8x16(src) {
  const U32 = SIMD.Uint32x4;
  const U8 = SIMD.Uint8x16;

  const reds   = U32.load(src, 0);
  const greens = U32.load(src, 4);
  const blues  = U32.load(src, 8);
  const alphas = U32.load(src, 12);

  // Shifting each channel by 0, 8, 16 and 24 bits moves it to its own byte of
  // every 32-bit lane; the reds need no shift.
  const redBytes   = U8.fromUint32x4Bits(reds);
  const greenBytes = U8.fromUint32x4Bits(U32.shiftLeftByScalar(greens, 8));
  const blueBytes  = U8.fromUint32x4Bits(U32.shiftLeftByScalar(blues, 16));
  const alphaBytes = U8.fromUint32x4Bits(U32.shiftLeftByScalar(alphas, 24));

  return U8.or(U8.or(U8.or(redBytes, greenBytes), blueBytes), alphaBytes);
}

So that is a total of 3 shifts and 3 `or` operations.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

SIMD uint8x16 to 4 of uint32x4 with transpose? #348

SIMD uint8x16 to 4 of uint32x4 with transpose? #348

ghost commented Aug 12, 2016 •

edited by ghost

Loading

PeterJensen commented Aug 15, 2016

ghost commented Aug 15, 2016

PeterJensen commented Aug 15, 2016 •

edited

Loading

juj commented Aug 16, 2016

ghost commented Aug 19, 2016 •

edited by ghost

Loading

PeterJensen commented Aug 19, 2016

ljharb commented Aug 19, 2016

juj commented Aug 20, 2016

PeterJensen commented Aug 20, 2016

SIMD uint8x16 to 4 of uint32x4 with transpose? #348

SIMD uint8x16 to 4 of uint32x4 with transpose? #348

Comments

ghost commented Aug 12, 2016 • edited by ghost Loading

PeterJensen commented Aug 15, 2016

ghost commented Aug 15, 2016

PeterJensen commented Aug 15, 2016 • edited Loading

juj commented Aug 16, 2016

ghost commented Aug 19, 2016 • edited by ghost Loading

PeterJensen commented Aug 19, 2016

ljharb commented Aug 19, 2016

juj commented Aug 20, 2016

PeterJensen commented Aug 20, 2016

ghost commented Aug 12, 2016 •

edited by ghost

Loading

PeterJensen commented Aug 15, 2016 •

edited

Loading

ghost commented Aug 19, 2016 •

edited by ghost

Loading