Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Optimize const ShiftRightLogical for byte values on XArch #86841

Merged
merged 1 commit on Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19799,7 +19799,7 @@ GenTree* Compiler::gtNewSimdBinOpNode(
simdBaseType = TYP_LONG;
}

assert(!varTypeIsByte(simdBaseType));
assert(!varTypeIsByte(simdBaseType) || op == GT_RSZ);

// "over shifting" is platform specific behavior. We will match the C# behavior
// this requires we mask with (sizeof(T) * 8) - 1 which ensures the shift cannot
Expand All @@ -19809,13 +19809,23 @@ GenTree* Compiler::gtNewSimdBinOpNode(

unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1;

GenTree* nonConstantByteShiftCountOp = NULL;

if (op2->IsCnsIntOrI())
{
op2->AsIntCon()->gtIconVal &= shiftCountMask;
}
else
{
op2 = gtNewOperNode(GT_AND, TYP_INT, op2, gtNewIconNode(shiftCountMask));

if (varTypeIsByte(simdBaseType))
{
// Save the intermediate "shiftCount & shiftCountMask" value as we will need it to
// calculate which bits we should mask off after the 32-bit shift we use for 8-bit values.
nonConstantByteShiftCountOp = fgMakeMultiUse(&op2);
}

op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_SSE2_ConvertScalarToVector128Int32, CORINFO_TYPE_INT,
16);
}
Expand Down Expand Up @@ -19908,6 +19918,34 @@ GenTree* Compiler::gtNewSimdBinOpNode(
assert(op == GT_RSZ);
intrinsic = NI_SSE2_ShiftRightLogical;
}

if (varTypeIsByte(simdBaseType))
{
assert(op == GT_RSZ);

// We don't have actual instructions for shifting bytes, so we'll emulate them
// by shifting 32-bit values and masking off the bits that should be zeroed.
GenTree* maskAmountOp;

if (op2->IsCnsIntOrI())
{
ssize_t shiftCount = op2->AsIntCon()->gtIconVal;
ssize_t mask = 255 >> shiftCount;

maskAmountOp = gtNewIconNode(mask, type);
}
else
{
assert(nonConstantByteShiftCountOp != NULL);

maskAmountOp = gtNewOperNode(GT_RSZ, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp);
}

GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize);
GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize);

return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize);
}
break;
}

Expand Down
6 changes: 0 additions & 6 deletions src/coreclr/jit/hwintrinsicxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2636,12 +2636,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if (varTypeIsByte(simdBaseType))
{
// byte and sbyte would require more work to support
break;
}

if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
op2 = impPopStack().val;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -879,10 +879,10 @@ private static Vector128<byte> IndexOfAnyLookupCore(Vector128<byte> source, Vect

// On ARM, we have an instruction for an arithmetic right shift of 1-byte signed values.
// The shift will map values above 127 to values above 16, which the shuffle will then map to 0.
// On X86 and WASM, use a 4-byte value shift with AND 15 to emulate a 1-byte value logical shift.
// On X86 and WASM, use a logical right shift instead.
Vector128<byte> highNibbles = AdvSimd.IsSupported
? AdvSimd.ShiftRightArithmetic(source.AsSByte(), 4).AsByte()
: (source.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)0xF);
: source >>> 4;

// The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle.
// Lookup the rows via the lower nibble and the column via the higher nibble.
Expand Down Expand Up @@ -913,7 +913,7 @@ private static Vector256<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector
private static Vector256<byte> IndexOfAnyLookupCore(Vector256<byte> source, Vector256<byte> bitmapLookup)
{
// See comments in IndexOfAnyLookupCore(Vector128<byte>) above for more details.
Vector256<byte> highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF);
Vector256<byte> highNibbles = source >>> 4;
Vector256<byte> bitMask = Avx2.Shuffle(bitmapLookup, source);
Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles);
Vector256<byte> result = bitMask & bitPositions;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,9 @@ private static Vector256<byte> ContainsMask32CharsAvx2(Vector256<byte> charMapLo
[CompExactlyDependsOn(typeof(Avx2))]
private static Vector256<byte> IsCharBitSetAvx2(Vector256<byte> charMapLower, Vector256<byte> charMapUpper, Vector256<byte> values)
{
// X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
Vector256<byte> highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15);
Vector256<byte> shifted = values >>> VectorizedIndexShift;

Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble);
Vector256<byte> bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), shifted);

Vector256<byte> index = values & Vector256.Create((byte)VectorizedIndexMask);
Vector256<byte> bitMaskLower = Avx2.Shuffle(charMapLower, index);
Expand Down Expand Up @@ -175,12 +174,9 @@ private static Vector128<byte> ContainsMask16Chars(Vector128<byte> charMapLower,
[CompExactlyDependsOn(typeof(PackedSimd))]
private static Vector128<byte> IsCharBitSet(Vector128<byte> charMapLower, Vector128<byte> charMapUpper, Vector128<byte> values)
{
// X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564
Vector128<byte> highNibble = Sse2.IsSupported
? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15)
: values >>> VectorizedIndexShift;
Vector128<byte> shifted = values >>> VectorizedIndexShift;

Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble);
Vector128<byte> bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), shifted);

Vector128<byte> index = values & Vector128.Create((byte)VectorizedIndexMask);
Vector128<byte> bitMask;
Expand Down