From aa5539a35b04855b18464ea26a7ea36ae4ad8027 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 15 Apr 2023 17:22:26 +0200 Subject: [PATCH 1/4] Enable AVX-512 in importervectorization.cpp --- src/coreclr/jit/importervectorization.cpp | 141 +++++++--------------- 1 file changed, 41 insertions(+), 100 deletions(-) diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index 9f7c6f3cefab2b..de213f04e83049 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -6,8 +6,8 @@ #pragma hdrstop #endif -// For now the max possible size is Vector256.Count * 2 -#define MaxPossibleUnrollSize 32 +// For now the max possible size is Vector512.Count * 2 +#define MaxPossibleUnrollSize 64 //------------------------------------------------------------------------ // importer_vectorization.cpp @@ -71,7 +71,7 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length) #if defined(FEATURE_HW_INTRINSICS) //------------------------------------------------------------------------ // impExpandHalfConstEqualsSIMD: Attempts to unroll and vectorize -// Equals against a constant WCHAR data for Length in [8..32] range +// Equals against a constant WCHAR data for Length in [8..64] range // using SIMD instructions. C# equivalent of what this function emits: // // bool IsTestString(ReadOnlySpan span) @@ -108,31 +108,20 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( { assert(len >= 8 && len <= MaxPossibleUnrollSize); - if (!IsBaselineSimdIsaSupported()) + const int byteLen = len * 2; + const int simdSize = (int)roundDownSIMDSize(byteLen); + if (byteLen > (simdSize * 2)) { - // We need baseline SIMD support at least + // Data is too big to be processed via two SIMD loads + // or baseline has no SIMD support return nullptr; } - - CorInfoType baseType = CORINFO_TYPE_NATIVEUINT; - - int simdSize; - var_types simdType; - - NamedIntrinsic niEquals; - - GenTreeVecCon* cnsVec1 = nullptr; - GenTreeVecCon* cnsVec2 = nullptr; - GenTree* toLowerVec1 = nullptr; - GenTree* toLowerVec2 = nullptr; - - // Optimization: don't use two vectors for Length == 8 or 16 - bool useSingleVector = false; + assert(simdSize >= 16 && isPow2(simdSize)); WCHAR cnsValue[MaxPossibleUnrollSize] = {}; WCHAR toLowerMask[MaxPossibleUnrollSize] = {}; - memcpy((UINT8*)cnsValue, (UINT8*)cns, len * sizeof(WCHAR)); + memcpy(cnsValue, cns, len * sizeof(WCHAR)); if ((cmpMode == OrdinalIgnoreCase) && !ConvertToLowerCase(cnsValue, toLowerMask, len)) { @@ -140,89 +129,23 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( return nullptr; } -#if defined(TARGET_XARCH) - if (compOpportunisticallyDependsOn(InstructionSet_Vector256) && len >= 16) - { - // Handle [16..32] inputs via two Vector256 - assert(len >= 16 && len <= 32); - - simdSize = 32; - simdType = TYP_SIMD32; - - niEquals = NI_Vector256_op_Equality; - - // Special case: use a single vector for Length == 16 - useSingleVector = len == 16; - - cnsVec1 = gtNewVconNode(simdType, cnsValue); - cnsVec2 = gtNewVconNode(simdType, cnsValue + len - 16); - - if (cmpMode == OrdinalIgnoreCase) - { - toLowerVec1 = gtNewVconNode(simdType, toLowerMask); - toLowerVec2 = gtNewVconNode(simdType, toLowerMask + len - 16); - } - } - else -#endif // TARGET_XARCH - if (len <= 16) - { - // Handle [8..16] inputs via two Vector128 - assert(len >= 8 && len <= 16); - - simdSize = 16; - simdType = TYP_SIMD16; - - niEquals = NI_Vector128_op_Equality; - - // Special case: use a single vector for Length == 8 - useSingleVector = len == 8; - - cnsVec1 = gtNewVconNode(simdType, cnsValue); - cnsVec2 = gtNewVconNode(simdType, cnsValue + len - 8); - - if (cmpMode == OrdinalIgnoreCase) - { - toLowerVec1 = gtNewVconNode(simdType, toLowerMask); - toLowerVec2 = gtNewVconNode(simdType, toLowerMask + len - 8); - } - } - else - { - JITDUMP("impExpandHalfConstEqualsSIMD: No V256 support and data is too big for V128\n"); - // NOTE: We might consider using four V128 for ARM64 - return nullptr; - } - - GenTree* zero = gtNewZeroConNode(simdType); + const var_types simdType = getSIMDTypeForSize(simdSize); + const CorInfoType baseType = CORINFO_TYPE_NATIVEUINT; - GenTree* offset1 = gtNewIconNode(dataOffset, TYP_I_IMPL); - GenTree* offset2 = gtNewIconNode(dataOffset + len * sizeof(USHORT) - simdSize, TYP_I_IMPL); - GenTree* dataPtr1 = gtNewOperNode(GT_ADD, TYP_BYREF, data, offset1); - GenTree* dataPtr2 = gtNewOperNode(GT_ADD, TYP_BYREF, gtClone(data), offset2); + GenTreeVecCon* cnsVec1 = gtNewVconNode(simdType, cnsValue); + GenTreeVecCon* cnsVec2 = gtNewVconNode(simdType, (BYTE*)cnsValue + byteLen - simdSize); - GenTree* vec1 = gtNewIndir(simdType, dataPtr1); - GenTree* vec2 = gtNewIndir(simdType, dataPtr2); - - // TODO-Unroll-CQ: Spill vec1 and vec2 for better pipelining, currently we end up emitting: - // - // vmovdqu xmm0, xmmword ptr [rcx+12] - // vpxor xmm0, xmm0, xmmword ptr[reloc @RWD00] - // vmovdqu xmm1, xmmword ptr [rcx+20] - // vpxor xmm1, xmm1, xmmword ptr[reloc @RWD16] - // - // While we should re-order them to be: - // - // vmovdqu xmm0, xmmword ptr [rcx+12] - // vmovdqu xmm1, xmmword ptr [rcx+20] - // vpxor xmm0, xmm0, xmmword ptr[reloc @RWD00] - // vpxor xmm1, xmm1, xmmword ptr[reloc @RWD16] - // + GenTree* offset1 = gtNewIconNode(dataOffset, TYP_I_IMPL); + GenTree* offset2 = gtNewIconNode(dataOffset + byteLen - simdSize, TYP_I_IMPL); + GenTree* vec1 = gtNewIndir(simdType, gtNewOperNode(GT_ADD, TYP_BYREF, data, offset1)); + GenTree* vec2 = gtNewIndir(simdType, gtNewOperNode(GT_ADD, TYP_BYREF, gtClone(data), offset2)); if (cmpMode == OrdinalIgnoreCase) { // Apply ASCII-only ToLowerCase mask (bitwise OR 0x20 for all a-Z chars) - assert((toLowerVec1 != nullptr) && (toLowerVec2 != nullptr)); + GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask); + GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize); + vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize); vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize); } @@ -231,7 +154,25 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( GenTree* xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize); GenTree* xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize); GenTree* orr = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize); - return gtNewSimdHWIntrinsicNode(TYP_BOOL, useSingleVector ? xor1 : orr, zero, niEquals, baseType, simdSize); + + // Optimization: use a single load when byteLen equals simdSize. + // For code simplicity we always create nodes for two vectors case. + const bool useSingleVector = simdSize == byteLen; + return gtNewSimdCmpOpAllNode(GT_EQ, TYP_BOOL, useSingleVector ? xor1 : orr, gtNewZeroConNode(simdType), baseType, + simdSize); + + // Codegen example for byteLen=40 and OrdinalIgnoreCase mode with AVX: + // + // vmovups ymm0, ymmword ptr [rcx+0CH] + // vpor ymm0, ymm0, ymmword ptr [reloc @RWD00] + // vpxor ymm0, ymm0, ymmword ptr [reloc @RWD32] + // vmovups ymm1, ymmword ptr [rcx+28H] + // vpor ymm1, ymm1, ymmword ptr [reloc @RWD64] + // vpxor ymm1, ymm1, ymmword ptr [reloc @RWD96] + // vpor ymm0, ymm0, ymm1 + // vptest ymm0, ymm0 + // sete al + // movzx rax, al } #endif // defined(FEATURE_HW_INTRINSICS) @@ -491,7 +432,7 @@ GenTree* Compiler::impExpandHalfConstEquals(GenTreeLclVar* data, indirCmp = impExpandHalfConstEqualsSWAR(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode); } #if defined(FEATURE_HW_INTRINSICS) - else if (len <= 32) + else { indirCmp = impExpandHalfConstEqualsSIMD(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode); } From 01b9fba0fc20bcf1e703b925794fe3795678b4cc Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sat, 15 Apr 2023 18:55:15 +0200 Subject: [PATCH 2/4] Update importervectorization.cpp --- src/coreclr/jit/importervectorization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index de213f04e83049..744b0cc6f8621f 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -432,7 +432,7 @@ GenTree* Compiler::impExpandHalfConstEquals(GenTreeLclVar* data, indirCmp = impExpandHalfConstEqualsSWAR(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode); } #if defined(FEATURE_HW_INTRINSICS) - else + else if (IsBaselineSimdIsaSupported()) { indirCmp = impExpandHalfConstEqualsSIMD(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode); } From 14908aa3f40ba1a4caa4bb0c7715544bbf555891 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 16 Apr 2023 00:56:17 +0200 Subject: [PATCH 3/4] Update importervectorization.cpp --- src/coreclr/jit/importervectorization.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index 744b0cc6f8621f..3f001c43dc380e 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -108,7 +108,7 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( { assert(len >= 8 && len <= MaxPossibleUnrollSize); - const int byteLen = len * 2; + const int byteLen = len * sizeof(WCHAR); const int simdSize = (int)roundDownSIMDSize(byteLen); if (byteLen > (simdSize * 2)) { @@ -116,12 +116,12 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( // or baseline has no SIMD support return nullptr; } - assert(simdSize >= 16 && isPow2(simdSize)); + assert(simdSize >= 16); WCHAR cnsValue[MaxPossibleUnrollSize] = {}; WCHAR toLowerMask[MaxPossibleUnrollSize] = {}; - memcpy(cnsValue, cns, len * sizeof(WCHAR)); + memcpy(cnsValue, cns, byteLen); if ((cmpMode == OrdinalIgnoreCase) && !ConvertToLowerCase(cnsValue, toLowerMask, len)) { From aa0ad5221eeff708ca98af393323ccb8aca064eb Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Sun, 16 Apr 2023 00:58:01 +0200 Subject: [PATCH 4/4] Update importervectorization.cpp --- src/coreclr/jit/importervectorization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index 3f001c43dc380e..da313d5e339bdd 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -116,7 +116,7 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD( // or baseline has no SIMD support return nullptr; } - assert(simdSize >= 16); + assert((byteLen >= simdSize) && (simdSize >= 16)); WCHAR cnsValue[MaxPossibleUnrollSize] = {}; WCHAR toLowerMask[MaxPossibleUnrollSize] = {};