From a28e9811e6e933fa8df6c8e9897957811ab2e511 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 8 Jul 2023 19:24:22 +0200 Subject: [PATCH] Optimize logical right shifts for byte values on XArch --- src/coreclr/jit/gentree.cpp | 40 ++++++++++++++++++- src/coreclr/jit/hwintrinsicxarch.cpp | 6 --- .../SearchValues/IndexOfAnyAsciiSearcher.cs | 6 +-- .../System/SearchValues/ProbabilisticMap.cs | 12 ++---- 4 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ee9be975c32d5..c5626dc9f3971 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19799,7 +19799,7 @@ GenTree* Compiler::gtNewSimdBinOpNode( simdBaseType = TYP_LONG; } - assert(!varTypeIsByte(simdBaseType)); + assert(!varTypeIsByte(simdBaseType) || op == GT_RSZ); // "over shifting" is platform specific behavior. We will match the C# behavior // this requires we mask with (sizeof(T) * 8) - 1 which ensures the shift cannot @@ -19809,6 +19809,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( unsigned shiftCountMask = (genTypeSize(simdBaseType) * 8) - 1; + GenTree* nonConstantByteShiftCountOp = NULL; + if (op2->IsCnsIntOrI()) { op2->AsIntCon()->gtIconVal &= shiftCountMask; @@ -19816,6 +19818,14 @@ GenTree* Compiler::gtNewSimdBinOpNode( else { op2 = gtNewOperNode(GT_AND, TYP_INT, op2, gtNewIconNode(shiftCountMask)); + + if (varTypeIsByte(simdBaseType)) + { + // Save the intermediate "shiftCount & shiftCountMask" value as we will need it to + // calculate which bits we should mask off after the 32-bit shift we use for 8-bit values. + nonConstantByteShiftCountOp = fgMakeMultiUse(&op2); + } + op2 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, NI_SSE2_ConvertScalarToVector128Int32, CORINFO_TYPE_INT, 16); } @@ -19908,6 +19918,34 @@ GenTree* Compiler::gtNewSimdBinOpNode( assert(op == GT_RSZ); intrinsic = NI_SSE2_ShiftRightLogical; } + + if (varTypeIsByte(simdBaseType)) + { + assert(op == GT_RSZ); + + // We don't have actual instructions for shifting bytes, so we'll emulate them + // by shifting 32-bit values and masking off the bits that should be zeroed. + GenTree* maskAmountOp; + + if (op2->IsCnsIntOrI()) + { + ssize_t shiftCount = op2->AsIntCon()->gtIconVal; + ssize_t mask = 255 >> shiftCount; + + maskAmountOp = gtNewIconNode(mask, type); + } + else + { + assert(nonConstantByteShiftCountOp != NULL); + + maskAmountOp = gtNewOperNode(GT_RSZ, TYP_INT, gtNewIconNode(255), nonConstantByteShiftCountOp); + } + + GenTree* shiftOp = gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, CORINFO_TYPE_INT, simdSize); + GenTree* maskOp = gtNewSimdCreateBroadcastNode(type, maskAmountOp, simdBaseJitType, simdSize); + + return gtNewSimdBinOpNode(GT_AND, type, shiftOp, maskOp, simdBaseJitType, simdSize); + } break; } diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 270dffb9d919c..b2a556a960d17 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -2636,12 +2636,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - if (varTypeIsByte(simdBaseType)) - { - // byte and sbyte would require more work to support - break; - } - if ((simdSize != 32) || compOpportunisticallyDependsOn(InstructionSet_AVX2)) { op2 = impPopStack().val; diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs index 87de3a404b84e..4f34dab37363c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/IndexOfAnyAsciiSearcher.cs @@ -879,10 +879,10 @@ private static Vector128 IndexOfAnyLookupCore(Vector128 source, Vect // On ARM, we have an instruction for an arithmetic right shift of 1-byte signed values. // The shift will map values above 127 to values above 16, which the shuffle will then map to 0. - // On X86 and WASM, use a 4-byte value shift with AND 15 to emulate a 1-byte value logical shift. + // On X86 and WASM, use a logical right shift instead. Vector128 highNibbles = AdvSimd.IsSupported ? AdvSimd.ShiftRightArithmetic(source.AsSByte(), 4).AsByte() - : (source.AsInt32() >>> 4).AsByte() & Vector128.Create((byte)0xF); + : source >>> 4; // The bitmapLookup represents a 8x16 table of bits, indicating whether a character is present in the needle. // Lookup the rows via the lower nibble and the column via the higher nibble. @@ -913,7 +913,7 @@ private static Vector256 IndexOfAnyLookup(Vector private static Vector256 IndexOfAnyLookupCore(Vector256 source, Vector256 bitmapLookup) { // See comments in IndexOfAnyLookupCore(Vector128) above for more details. - Vector256 highNibbles = Vector256.ShiftRightLogical(source.AsInt32(), 4).AsByte() & Vector256.Create((byte)0xF); + Vector256 highNibbles = source >>> 4; Vector256 bitMask = Avx2.Shuffle(bitmapLookup, source); Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibbles); Vector256 result = bitMask & bitPositions; diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs index 8a0d6532382f5..f42e130afeb43 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs @@ -131,10 +131,9 @@ private static Vector256 ContainsMask32CharsAvx2(Vector256 charMapLo [CompExactlyDependsOn(typeof(Avx2))] private static Vector256 IsCharBitSetAvx2(Vector256 charMapLower, Vector256 charMapUpper, Vector256 values) { - // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 - Vector256 highNibble = (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector256.Create((byte)15); + Vector256 shifted = values >>> VectorizedIndexShift; - Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), highNibble); + Vector256 bitPositions = Avx2.Shuffle(Vector256.Create(0x8040201008040201).AsByte(), shifted); Vector256 index = values & Vector256.Create((byte)VectorizedIndexMask); Vector256 bitMaskLower = Avx2.Shuffle(charMapLower, index); @@ -175,12 +174,9 @@ private static Vector128 ContainsMask16Chars(Vector128 charMapLower, [CompExactlyDependsOn(typeof(PackedSimd))] private static Vector128 IsCharBitSet(Vector128 charMapLower, Vector128 charMapUpper, Vector128 values) { - // X86 doesn't have a logical right shift intrinsic for bytes: https://github.com/dotnet/runtime/issues/82564 - Vector128 highNibble = Sse2.IsSupported - ? (values.AsInt32() >>> VectorizedIndexShift).AsByte() & Vector128.Create((byte)15) - : values >>> VectorizedIndexShift; + Vector128 shifted = values >>> VectorizedIndexShift; - Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), highNibble); + Vector128 bitPositions = Vector128.ShuffleUnsafe(Vector128.Create(0x8040201008040201).AsByte(), shifted); Vector128 index = values & Vector128.Create((byte)VectorizedIndexMask); Vector128 bitMask;