Optimize logical right shifts for byte values on XArch
MihaZupan committed Jul 8, 2023
1 parent b6266a6 commit a28e981
Showing 4 changed files with 46 additions and 18 deletions.
40 changes: 39 additions & 1 deletion src/coreclr/jit/gentree.cpp
@@ -19799,7 +19799,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
    simdBaseType = TYP_LONG;
}

-assert(!varTypeIsByte(simdBaseType));
+assert(!varTypeIsByte(simdBaseType) || op == GT_RSZ);

// "over shifting" is platform specific behavior. We will match the C# behavior
// this requires we mask with (sizeof(T) * 8) - 1 which ensures the shift cannot
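
As a point of reference, here is a minimal C# sketch (hypothetical values, not part of this commit) of the shift-count masking the comment above describes; for byte elements the mask is (sizeof(byte) * 8) - 1 == 7:

    using System;
    using System.Runtime.Intrinsics;

    // Assumes the masking above applies end to end: 9 & 7 == 1, so both shifts match.
    Vector128<byte> v = Vector128.Create((byte)0x80);
    Console.WriteLine((v >>> 9) == (v >>> 1)); // True: every lane is 0x40 either way
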
@@ -19809,13 +19809,23 @@

unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;

+GenTree* nonConstantByteShiftCountOp = NULL;
+
if (op2->IsCnsIntOrI())
{
    op2->AsIntCon()->gtIconVal &= shiftCountMask;
}
else
{
    op2 = gtNewOperNode(GT_AND, TYP_INT, op2, gtNewIconNode(shiftCountMask));
+
+    if (varTypeIsByte(simdBaseType))
+    {
+        // Save the intermediate "shiftCount & shiftCountMask" value as we will need it to
+        // calculate which bits we should mask off after the 32-bit shift we use for 8-bit values.
+        nonConstantByteShiftCountOp = fgMakeMultiUse(&op2);
+    }
+
    op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_SSE2_ConvertScalarToVector128Int32, CORINFO_TYPE_INT,
                                   16);
}
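
The masked count is saved because the non-constant path uses it twice: once as the 32-bit shift amount and once, further down in this change, to build the 255 >> count byte mask. A rough scalar model of that dual use, with four bytes packed into a uint (illustrative names, not JIT code):

    using System;

    // Illustrative scalar model: four bytes packed into a uint are shifted as one
    // 32-bit value, then every byte is masked with the same 255 >> count value.
    Console.WriteLine(ShiftEachByteRight(0xFF10FF10u, 9).ToString("X8")); // 7F087F08: each byte shifted by 9 & 7 == 1

    static uint ShiftEachByteRight(uint fourPackedBytes, int count)
    {
        int masked = count & 7;                                // shiftCount & shiftCountMask
        uint shifted = fourPackedBytes >>> masked;             // a single 32-bit logical shift
        uint byteMask = (uint)(255 >>> masked) * 0x01010101u;  // 255 >> count, broadcast into every byte
        return shifted & byteMask;                             // drop bits that leaked in from the byte above
    }
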
@@ -19908,6 +19918,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
        assert(op == GT_RSZ);
        intrinsic = NI_SSE2_ShiftRightLogical;
    }
+
+    if (varTypeIsByte(simdBaseType))
+    {
+        assert(op == GT_RSZ);
+
+        // We don't have actual instructions for shifting bytes, so we'll emulate them
+        // by shifting 32-bit values and masking off the bits that should be zeroed.
+        GenTree* maskAmountOp;
+
+        if (op2->IsCnsIntOrI())
+        {
+            ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
+            ssize_t mask = 255 >> shiftCount;
+
+            maskAmountOp = gtNewIconNode(mask, type);
+        }
+        else
+        {
+            assert(nonConstantByteShiftCountOp != NULL);
+
+            maskAmountOp = gtNewOperNode(GT_RSZ, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
+        }
+
+        GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
+        GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);
+
+        return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
+    }
    break;
}

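A minimal C# sketch of the expansion above, done by hand with the pre-existing 32-bit element shift and a broadcast 255 >> count mask, and checked against a plain per-byte shift (the class name and test values are made up):

    using System;
    using System.Runtime.Intrinsics;

    class ByteShiftDemo
    {
        // Mirrors the expansion above: shift the vector as 32-bit elements, then AND with
        // a broadcast (255 >> count) mask to clear the bits pulled in from the byte above.
        static Vector128<byte> EmulatedShiftRight(Vector128<byte> value, int count)
        {
            count &= 7;
            Vector128<byte> shifted = (value.AsUInt32() >>> count).AsByte();
            Vector128<byte> mask = Vector128.Create((byte)(255 >> count));
            return shifted & mask;
        }

        static void Main()
        {
            var input = Vector128.Create((byte)0, 1, 2, 3, 40, 50, 60, 70, 128, 129, 200, 222, 250, 251, 254, 255);

            for (int count = 0; count < 8; count++)
            {
                Vector128<byte> expected = Vector128<byte>.Zero;
                for (int i = 0; i < Vector128<byte>.Count; i++)
                {
                    expected = expected.WithElement(i, (byte)(input.GetElement(i) >> count));
                }

                Console.WriteLine(EmulatedShiftRight(input, count) == expected); // True for every count
            }
        }
    }
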
6 changes: 0 additions & 6 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -2636,12 +2636,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
    assert(sig->numArgs == 2);

-    if (varTypeIsByte(simdBaseType))
-    {
-        // byte and sbyte would require more work to support
-        break;
-    }
-
    if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
    {
        op2 = impPopStack().val;
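With the byte/sbyte early-out gone, byte-vector shifts such as the following hypothetical snippet should now reach the intrinsic expansion on x86/x64 (the 256-bit form still subject to the AVX2 check above) rather than the software fallback:

    using System.Runtime.Intrinsics;

    // Hypothetical values; both forms should lower to the shift-and-mask expansion added in gentree.cpp above.
    Vector128<byte> packed128 = Vector128.Create((byte)0xAB);
    Vector256<byte> packed256 = Vector256.Create((byte)0xAB);

    Vector128<byte> highNibbles128 = Vector128.ShiftRightLogical(packed128, 4);
    Vector256<byte> highNibbles256 = packed256 >>> 4;
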
@@ -879,10 +879,10 @@ private static Vector128<byte> IndexOfAnyLookupCore(Vector128<byte> source, Vect

    // On ARM, we have an instruction for an arithmetic right shift of 1-byte signed values.
    // The shift will map values above 127 to values above 16, which the shuffle will then map to 0.
-   // On X86 and WASM, use a 4-byte value shift with AND 15 to emulate a 1-byte value logical shift.
+   // On X86 and WASM, use a logical right shift instead.
    Vector128<byte> highNibbles = AdvSimd.IsSupported
        ? AdvSimd.ShiftRightArithmetic(source.AsSByte(), 4).AsByte()
-       : (source.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)0xF);
+       : source >>> 4;

    // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle.
    // Lookup the rows via the lower nibble and the column via the higher nibble.
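
A scalar sketch of the lookup described in the comment above: each byte selects a row of the 8x16 bit table with its lower nibble and a bit within that row with its higher nibble (illustrative helper, assuming ASCII input; not library code):

    using System;

    // bitmapLookup is 16 bytes: bit r of bitmapLookup[c] records whether the character
    // with lower nibble c and higher nibble r is present in the needle.
    Span<byte> bitmap = stackalloc byte[16];
    bitmap['a' & 0xF] |= (byte)(1 << ('a' >>> 4)); // add 'a' (0x61): row 1, bit 6

    Console.WriteLine(IsInNeedle(bitmap, (byte)'a')); // True
    Console.WriteLine(IsInNeedle(bitmap, (byte)'b')); // False: row 2 has no bits set

    static bool IsInNeedle(ReadOnlySpan<byte> bitmapLookup, byte b)
    {
        int row = b & 0xF; // lower nibble selects the byte ("row") of the table
        int bit = b >>> 4; // higher nibble selects the bit within that row
        return (bitmapLookup[row] & (1 << bit)) != 0;
    }
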
@@ -913,7 +913,7 @@ private static Vector256<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector
private static Vector256<byte> IndexOfAnyLookupCore(Vector256<byte> source, Vector256<byte> bitmapLookup)
{
    // See comments in IndexOfAnyLookupCore(Vector128<byte>) above for more details.
-   Vector256<byte> highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF);
+   Vector256<byte> highNibbles = source >>> 4;
    Vector256<byte> bitMask = Avx2.Shuffle(bitmapLookup, source);
    Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles);
    Vector256<byte> result = bitMask & bitPositions;
@@ -131,10 +131,9 @@ private static Vector256<byte> ContainsMask32CharsAvx2(Vector256<byte> charMapLo
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
{
-   // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
-   Vector256<byte> highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15);
+   Vector256<byte> shifted = values >>> VectorizedIndexShift;

-   Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble);
+   Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), shifted);

    Vector256<byte> index = values & Vector256.Create((byte)VectorizedIndexMask);
    Vector256<byte> bitMaskLower = Avx2.Shuffle(charMapLower, index);
@@ -175,12 +174,9 @@ private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower,
[CompExactlyDependsOn(typeof(PackedSimd))]
private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
{
-   // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
-   Vector128<byte> highNibble = Sse2.IsSupported
-       ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15)
-       : values >>> VectorizedIndexShift;
+   Vector128<byte> shifted = values >>> VectorizedIndexShift;

-   Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble);
+   Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), shifted);

    Vector128<byte> index = values & Vector128.Create((byte)VectorizedIndexMask);
    Vector128<byte> bitMask;
