From aa5539a35b04855b18464ea26a7ea36ae4ad8027 Mon Sep 17 00:00:00 2001
From: EgorBo <egorbo@gmail.com>
Date: Sat, 15 Apr 2023 17:22:26 +0200
Subject: [PATCH 1/4] Enable AVX-512 in importervectorization.cpp

---
 src/coreclr/jit/importervectorization.cpp | 141 +++++++---------------
 1 file changed, 41 insertions(+), 100 deletions(-)
diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp
index 9f7c6f3cefab2b..de213f04e83049 100644
--- a/src/coreclr/jit/importervectorization.cpp
+++ b/src/coreclr/jit/importervectorization.cpp
@@ -6,8 +6,8 @@
 #pragma hdrstop
 #endif
 
-// For now the max possible size is Vector256<ushort>.Count * 2
-#define MaxPossibleUnrollSize 32
+// For now the max possible size is Vector512<ushort>.Count * 2
+#define MaxPossibleUnrollSize 64
 
 //------------------------------------------------------------------------
 // importer_vectorization.cpp
@@ -71,7 +71,7 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length)
 #if defined(FEATURE_HW_INTRINSICS)
 //------------------------------------------------------------------------
 // impExpandHalfConstEqualsSIMD: Attempts to unroll and vectorize
-//    Equals against a constant WCHAR data for Length in [8..32] range
+//    Equals against a constant WCHAR data for Length in [8..64] range
 //    using SIMD instructions. C# equivalent of what this function emits:
 //
 //    bool IsTestString(ReadOnlySpan<char> span)
@@ -108,31 +108,20 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
 {
     assert(len >= 8 && len <= MaxPossibleUnrollSize);
 
-    if (!IsBaselineSimdIsaSupported())
+    const int byteLen  = len * 2;
+    const int simdSize = (int)roundDownSIMDSize(byteLen);
+    if (byteLen > (simdSize * 2))
     {
-        // We need baseline SIMD support at least
+        // Data is too big to be processed via two SIMD loads
+        // or baseline has no SIMD support
         return nullptr;
     }
-
-    CorInfoType baseType = CORINFO_TYPE_NATIVEUINT;
-
-    int       simdSize;
-    var_types simdType;
-
-    NamedIntrinsic niEquals;
-
-    GenTreeVecCon* cnsVec1     = nullptr;
-    GenTreeVecCon* cnsVec2     = nullptr;
-    GenTree*       toLowerVec1 = nullptr;
-    GenTree*       toLowerVec2 = nullptr;
-
-    // Optimization: don't use two vectors for Length == 8 or 16
-    bool useSingleVector = false;
+    assert(simdSize >= 16 && isPow2(simdSize));
 
     WCHAR cnsValue[MaxPossibleUnrollSize]    = {};
     WCHAR toLowerMask[MaxPossibleUnrollSize] = {};
 
-    memcpy((UINT8*)cnsValue, (UINT8*)cns, len * sizeof(WCHAR));
+    memcpy(cnsValue, cns, len * sizeof(WCHAR));
 
     if ((cmpMode == OrdinalIgnoreCase) && !ConvertToLowerCase(cnsValue, toLowerMask, len))
     {
@@ -140,89 +129,23 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
         return nullptr;
     }
 
-#if defined(TARGET_XARCH)
-    if (compOpportunisticallyDependsOn(InstructionSet_Vector256) && len >= 16)
-    {
-        // Handle [16..32] inputs via two Vector256
-        assert(len >= 16 && len <= 32);
-
-        simdSize = 32;
-        simdType = TYP_SIMD32;
-
-        niEquals = NI_Vector256_op_Equality;
-
-        // Special case: use a single vector for Length == 16
-        useSingleVector = len == 16;
-
-        cnsVec1 = gtNewVconNode(simdType, cnsValue);
-        cnsVec2 = gtNewVconNode(simdType, cnsValue + len - 16);
-
-        if (cmpMode == OrdinalIgnoreCase)
-        {
-            toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
-            toLowerVec2 = gtNewVconNode(simdType, toLowerMask + len - 16);
-        }
-    }
-    else
-#endif // TARGET_XARCH
-        if (len <= 16)
-    {
-        // Handle [8..16] inputs via two Vector128
-        assert(len >= 8 && len <= 16);
-
-        simdSize = 16;
-        simdType = TYP_SIMD16;
-
-        niEquals = NI_Vector128_op_Equality;
-
-        // Special case: use a single vector for Length == 8
-        useSingleVector = len == 8;
-
-        cnsVec1 = gtNewVconNode(simdType, cnsValue);
-        cnsVec2 = gtNewVconNode(simdType, cnsValue + len - 8);
-
-        if (cmpMode == OrdinalIgnoreCase)
-        {
-            toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
-            toLowerVec2 = gtNewVconNode(simdType, toLowerMask + len - 8);
-        }
-    }
-    else
-    {
-        JITDUMP("impExpandHalfConstEqualsSIMD: No V256 support and data is too big for V128\n");
-        // NOTE: We might consider using four V128 for ARM64
-        return nullptr;
-    }
-
-    GenTree* zero = gtNewZeroConNode(simdType);
+    const var_types   simdType = getSIMDTypeForSize(simdSize);
+    const CorInfoType baseType = CORINFO_TYPE_NATIVEUINT;
 
-    GenTree* offset1  = gtNewIconNode(dataOffset, TYP_I_IMPL);
-    GenTree* offset2  = gtNewIconNode(dataOffset + len * sizeof(USHORT) - simdSize, TYP_I_IMPL);
-    GenTree* dataPtr1 = gtNewOperNode(GT_ADD, TYP_BYREF, data, offset1);
-    GenTree* dataPtr2 = gtNewOperNode(GT_ADD, TYP_BYREF, gtClone(data), offset2);
+    GenTreeVecCon* cnsVec1 = gtNewVconNode(simdType, cnsValue);
+    GenTreeVecCon* cnsVec2 = gtNewVconNode(simdType, (BYTE*)cnsValue + byteLen - simdSize);
 
-    GenTree* vec1 = gtNewIndir(simdType, dataPtr1);
-    GenTree* vec2 = gtNewIndir(simdType, dataPtr2);
-
-    // TODO-Unroll-CQ: Spill vec1 and vec2 for better pipelining, currently we end up emitting:
-    //
-    //   vmovdqu  xmm0, xmmword ptr [rcx+12]
-    //   vpxor    xmm0, xmm0, xmmword ptr[reloc @RWD00]
-    //   vmovdqu  xmm1, xmmword ptr [rcx+20]
-    //   vpxor    xmm1, xmm1, xmmword ptr[reloc @RWD16]
-    //
-    // While we should re-order them to be:
-    //
-    //   vmovdqu  xmm0, xmmword ptr [rcx+12]
-    //   vmovdqu  xmm1, xmmword ptr [rcx+20]
-    //   vpxor    xmm0, xmm0, xmmword ptr[reloc @RWD00]
-    //   vpxor    xmm1, xmm1, xmmword ptr[reloc @RWD16]
-    //
+    GenTree* offset1 = gtNewIconNode(dataOffset, TYP_I_IMPL);
+    GenTree* offset2 = gtNewIconNode(dataOffset + byteLen - simdSize, TYP_I_IMPL);
+    GenTree* vec1    = gtNewIndir(simdType, gtNewOperNode(GT_ADD, TYP_BYREF, data, offset1));
+    GenTree* vec2    = gtNewIndir(simdType, gtNewOperNode(GT_ADD, TYP_BYREF, gtClone(data), offset2));
 
     if (cmpMode == OrdinalIgnoreCase)
     {
         // Apply ASCII-only ToLowerCase mask (bitwise OR 0x20 for all a-Z chars)
-        assert((toLowerVec1 != nullptr) && (toLowerVec2 != nullptr));
+        GenTreeVecCon* toLowerVec1 = gtNewVconNode(simdType, toLowerMask);
+        GenTreeVecCon* toLowerVec2 = gtNewVconNode(simdType, (BYTE*)toLowerMask + byteLen - simdSize);
+
         vec1 = gtNewSimdBinOpNode(GT_OR, simdType, vec1, toLowerVec1, baseType, simdSize);
         vec2 = gtNewSimdBinOpNode(GT_OR, simdType, vec2, toLowerVec2, baseType, simdSize);
     }
@@ -231,7 +154,25 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
     GenTree* xor1 = gtNewSimdBinOpNode(GT_XOR, simdType, vec1, cnsVec1, baseType, simdSize);
     GenTree* xor2 = gtNewSimdBinOpNode(GT_XOR, simdType, vec2, cnsVec2, baseType, simdSize);
     GenTree* orr  = gtNewSimdBinOpNode(GT_OR, simdType, xor1, xor2, baseType, simdSize);
-    return gtNewSimdHWIntrinsicNode(TYP_BOOL, useSingleVector ? xor1 : orr, zero, niEquals, baseType, simdSize);
+
+    // Optimization: use a single load when byteLen equals simdSize.
+    // For code simplicity we always create the nodes for the two-vector case.
+    const bool useSingleVector = simdSize == byteLen;
+    return gtNewSimdCmpOpAllNode(GT_EQ, TYP_BOOL, useSingleVector ? xor1 : orr, gtNewZeroConNode(simdType), baseType,
+                                 simdSize);
+
+    // Codegen example for byteLen=60 and OrdinalIgnoreCase mode with AVX:
+    //
+    //  vmovups  ymm0, ymmword ptr [rcx+0CH]
+    //  vpor     ymm0, ymm0, ymmword ptr [reloc @RWD00]
+    //  vpxor    ymm0, ymm0, ymmword ptr [reloc @RWD32]
+    //  vmovups  ymm1, ymmword ptr [rcx+28H]
+    //  vpor     ymm1, ymm1, ymmword ptr [reloc @RWD64]
+    //  vpxor    ymm1, ymm1, ymmword ptr [reloc @RWD96]
+    //  vpor     ymm0, ymm0, ymm1
+    //  vptest   ymm0, ymm0
+    //  sete     al
+    //  movzx    rax, al
 }
 #endif // defined(FEATURE_HW_INTRINSICS)
 
@@ -491,7 +432,7 @@ GenTree* Compiler::impExpandHalfConstEquals(GenTreeLclVar*   data,
             indirCmp = impExpandHalfConstEqualsSWAR(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode);
         }
 #if defined(FEATURE_HW_INTRINSICS)
-        else if (len <= 32)
+        else
         {
             indirCmp = impExpandHalfConstEqualsSIMD(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode);
         }

From 01b9fba0fc20bcf1e703b925794fe3795678b4cc Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Sat, 15 Apr 2023 18:55:15 +0200
Subject: [PATCH 2/4] Update importervectorization.cpp

---
 src/coreclr/jit/importervectorization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp
index de213f04e83049..744b0cc6f8621f 100644
--- a/src/coreclr/jit/importervectorization.cpp
+++ b/src/coreclr/jit/importervectorization.cpp
@@ -432,7 +432,7 @@ GenTree* Compiler::impExpandHalfConstEquals(GenTreeLclVar*   data,
             indirCmp = impExpandHalfConstEqualsSWAR(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode);
         }
 #if defined(FEATURE_HW_INTRINSICS)
-        else
+        else if (IsBaselineSimdIsaSupported())
         {
             indirCmp = impExpandHalfConstEqualsSIMD(gtClone(data)->AsLclVar(), cnsData, len, dataOffset, cmpMode);
         }

From 14908aa3f40ba1a4caa4bb0c7715544bbf555891 Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Sun, 16 Apr 2023 00:56:17 +0200
Subject: [PATCH 3/4] Update importervectorization.cpp

---
 src/coreclr/jit/importervectorization.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp
index 744b0cc6f8621f..3f001c43dc380e 100644
--- a/src/coreclr/jit/importervectorization.cpp
+++ b/src/coreclr/jit/importervectorization.cpp
@@ -108,7 +108,7 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
 {
     assert(len >= 8 && len <= MaxPossibleUnrollSize);
 
-    const int byteLen  = len * 2;
+    const int byteLen  = len * sizeof(WCHAR);
     const int simdSize = (int)roundDownSIMDSize(byteLen);
     if (byteLen > (simdSize * 2))
     {
@@ -116,12 +116,12 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
         // or baseline has no SIMD support
         return nullptr;
     }
-    assert(simdSize >= 16 && isPow2(simdSize));
+    assert(simdSize >= 16);
 
     WCHAR cnsValue[MaxPossibleUnrollSize]    = {};
     WCHAR toLowerMask[MaxPossibleUnrollSize] = {};
 
-    memcpy(cnsValue, cns, len * sizeof(WCHAR));
+    memcpy(cnsValue, cns, byteLen);
 
     if ((cmpMode == OrdinalIgnoreCase) && !ConvertToLowerCase(cnsValue, toLowerMask, len))
     {

From aa0ad5221eeff708ca98af393323ccb8aca064eb Mon Sep 17 00:00:00 2001
From: Egor Bogatov <egorbo@gmail.com>
Date: Sun, 16 Apr 2023 00:58:01 +0200
Subject: [PATCH 4/4] Update importervectorization.cpp

---
 src/coreclr/jit/importervectorization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp
index 3f001c43dc380e..da313d5e339bdd 100644
--- a/src/coreclr/jit/importervectorization.cpp
+++ b/src/coreclr/jit/importervectorization.cpp
@@ -116,7 +116,7 @@ GenTree* Compiler::impExpandHalfConstEqualsSIMD(
         // or baseline has no SIMD support
         return nullptr;
     }
-    assert(simdSize >= 16);
+    assert((byteLen >= simdSize) && (simdSize >= 16));
 
     WCHAR cnsValue[MaxPossibleUnrollSize]    = {};
     WCHAR toLowerMask[MaxPossibleUnrollSize] = {};