[Arm64] Don't use D-copies in CopyBlock (#63588)
* Increase the maximum number of internal registers allowed per node in src/coreclr/jit/lsra.h

* Based on discussion in #63453, don't allocate a SIMD register pair if the JIT won't be able to use Q-copies in src/coreclr/jit/lsraarmarch.cpp

* Update CodeGen to reflect that Q-copies should be used only when size >= 2 * FP_REGSIZE_BYTES and using them makes the instruction sequence shorter in src/coreclr/jit/codegenarmarch.cpp (see the sketch below)

* Update comment - we don't use D-copies after that change in src/coreclr/jit/codegenarmarch.cpp
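
To give a rough feel for the size and instruction-count conditions above, here is a small standalone sketch. It is an illustrative model only, not the JIT's CopyBlockUnrollHelper::InstructionCount: it assumes every load/store moves a full register, that adjacent accesses pair into ldp/stp, and that the extra add needed to realign offsets costs one instruction.

    // Illustrative model only -- not the JIT's CopyBlockUnrollHelper. It assumes every
    // load/store moves a full register, adjacent accesses fuse into ldp/stp pairs, and
    // the residual tail costs one extra load/store pair.
    #include <cstdio>

    static int EstimateCopyInstrCount(unsigned size, unsigned regSize)
    {
        unsigned fullRegs = size / regSize;                 // whole registers moved
        unsigned pairs    = fullRegs / 2;                   // ldp/stp move two registers at once
        unsigned singles  = fullRegs % 2;                   // leftover single ldr/str
        unsigned tail     = (size % regSize != 0) ? 1 : 0;  // residual bytes -> one more ldr/str
        // Every pair is ldp+stp, every single/tail is ldr+str: two instructions each.
        return 2 * static_cast<int>(pairs + singles + tail);
    }

    int main()
    {
        const unsigned REGSIZE_BYTES    = 8;  // X (integer) registers
        const unsigned FP_REGSIZE_BYTES = 16; // Q (SIMD) registers
        const unsigned sizes[]          = {16, 32, 48, 96};
        for (unsigned size : sizes)
        {
            int xCopies = EstimateCopyInstrCount(size, REGSIZE_BYTES);
            int qCopies = EstimateCopyInstrCount(size, FP_REGSIZE_BYTES);
            // Mirrors the shape of the codegen heuristic: prefer Q-copies only when the
            // sequence, including one extra add to adjust the offsets, ends up shorter.
            bool useQ = (qCopies + 1) < xCopies;
            printf("size=%2u: X-copies=%2d Q-copies(+add)=%2d -> %s\n", size, xCopies, qCopies + 1,
                   useQ ? "use Q" : "use X");
        }
        return 0;
    }
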
echesakov authored Jan 19, 2022
1 parent ac4c7fb commit 675c167
Showing 3 changed files with 47 additions and 64 deletions.
57 changes: 18 additions & 39 deletions src/coreclr/jit/codegenarmarch.cpp
@@ -2840,8 +2840,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     // When both addresses are not 16-byte aligned the CopyBlock instruction sequence starts with padding
     // str instruction. For example, when both addresses are 8-byte aligned the instruction sequence looks like
     //
-    // ldr D_simdReg1, [srcReg, #srcOffset]
-    // str D_simdReg1, [dstReg, #dstOffset]
+    // ldr X_intReg1, [srcReg, #srcOffset]
+    // str X_intReg1, [dstReg, #dstOffset]
     // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+8]
     // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+8]
     // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+40]
@@ -2853,7 +2853,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     // be profitable).
 
     const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown &&
-                                        (size >= FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
+                                        (size >= 2 * FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
 
     bool shouldUse16ByteWideInstrs = false;
 
@@ -2876,8 +2876,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     {
         // In order to use 16-byte instructions the JIT needs to adjust either srcOffset or dstOffset.
         // The JIT should use 16-byte loads and stores when the resulting sequence (incl. an additional add
-        // instruction)
-        // has fewer number of instructions.
+        // instruction) has fewer number of instructions.
 
         if (helper.InstructionCount(FP_REGSIZE_BYTES) + 1 < helper.InstructionCount(REGSIZE_BYTES))
         {
@@ -2937,51 +2936,31 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
     const unsigned intRegCount = node->AvailableTempRegCount(RBM_ALLINT);
 
-    switch (intRegCount)
+    if (intRegCount >= 2)
     {
-        case 1:
-            intReg1 = node->GetSingleTempReg(RBM_ALLINT);
-            break;
-        case 2:
-            intReg1 = node->ExtractTempReg(RBM_ALLINT);
-            intReg2 = node->GetSingleTempReg(RBM_ALLINT);
-            break;
-        default:
-            break;
+        intReg1 = node->ExtractTempReg(RBM_ALLINT);
+        intReg2 = node->ExtractTempReg(RBM_ALLINT);
     }
-
-    regNumber simdReg1 = REG_NA;
-    regNumber simdReg2 = REG_NA;
-
-    const unsigned simdRegCount = node->AvailableTempRegCount(RBM_ALLFLOAT);
-
-    switch (simdRegCount)
+    else if (intRegCount == 1)
     {
-        case 1:
-            simdReg1 = node->GetSingleTempReg(RBM_ALLFLOAT);
-            break;
-        case 2:
-            simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT);
-            simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT);
-            break;
-        default:
-            break;
+        intReg1 = node->GetSingleTempReg(RBM_ALLINT);
+        intReg2 = rsGetRsvdReg();
     }
+    else
+    {
+        intReg1 = rsGetRsvdReg();
+    }
 
     if (shouldUse16ByteWideInstrs)
     {
+        const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT);
+        const regNumber simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT);
+
         helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
     }
     else
     {
-        if (intReg2 == REG_NA)
-        {
-            helper.Unroll(REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
-        }
-        else
-        {
-            helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter());
-        }
+        helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter());
     }
 #endif // TARGET_ARM64
 
2 changes: 1 addition & 1 deletion src/coreclr/jit/lsra.h
@@ -1772,7 +1772,7 @@ class LinearScan : public LinearScanInterface
 
     // The following keep track of information about internal (temporary register) intervals
     // during the building of a single node.
-    static const int MaxInternalCount = 4;
+    static const int MaxInternalCount = 5;
     RefPosition* internalDefs[MaxInternalCount];
     int internalCount = 0;
     bool setInternalRegsDelayFree;
52 changes: 28 additions & 24 deletions src/coreclr/jit/lsraarmarch.cpp
@@ -694,36 +694,40 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
             {
                 buildInternalIntRegisterDefForNode(blkNode);
 #ifdef TARGET_ARM64
-                const bool dstAddrMayNeedReg = dstAddr->isContained();
-                const bool srcAddrMayNeedReg = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
-                                               ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained());
+                const bool canUseLoadStorePairIntRegsInstrs = (size >= 2 * REGSIZE_BYTES);
 
-                if (srcAddrMayNeedReg && dstAddrMayNeedReg)
+                if (canUseLoadStorePairIntRegsInstrs)
                 {
-                    // The following allocates an additional integer register in a case
-                    // when a load instruction and a store instruction cannot be encoded.
+                    // CodeGen can use ldp/stp instructions sequence.
                     buildInternalIntRegisterDefForNode(blkNode);
-                    // In this case, CodeGen will use a SIMD register for copying.
-                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    // And in case of a larger block size, two SIMD registers.
-                    if (size >= 2 * REGSIZE_BYTES)
-                    {
-                        buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    }
                 }
-                else if (srcAddrMayNeedReg || dstAddrMayNeedReg)
+
+                const bool isSrcAddrLocal = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
+                                            ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr());
+                const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();
+
+                // CodeGen can use 16-byte SIMD ldp/stp for larger block sizes
+                // only when both source and destination base address registers have known alignment.
+                // This is the case, when both registers are either sp or fp.
+                bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES);
+
+                // Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point
+                // whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal).
+                // Therefore, it must allocate an additional integer register anyway.
+                if (canUse16ByteWideInstrs)
                 {
-                    if (size >= 2 * REGSIZE_BYTES)
-                    {
-                        buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                        buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    }
-                    else
-                    {
-                        buildInternalIntRegisterDefForNode(blkNode);
-                    }
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
                 }
-                else if (size >= 2 * REGSIZE_BYTES)
+
+                const bool srcAddrMayNeedReg =
+                    isSrcAddrLocal || ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained());
+                const bool dstAddrMayNeedReg = isDstAddrLocal || dstAddr->isContained();
+
+                // The following allocates an additional integer register in a case
+                // when a load instruction and a store instruction cannot be encoded using offset
+                // from a corresponding base register.
+                if (srcAddrMayNeedReg && dstAddrMayNeedReg)
                 {
                     buildInternalIntRegisterDefForNode(blkNode);
                 }
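
Read together with the lsra.h change above, one way to see why MaxInternalCount moves from 4 to 5 is to tally the internal registers a single unrolled CopyBlock node can now request in the worst case. The sketch below is an interpretation of the BuildBlockStore diff, not an authoritative statement of the JIT's accounting, and the constant names are invented for illustration:

    // A back-of-the-envelope tally based on the BuildBlockStore diff above; the names are
    // descriptive only and do not exist in the JIT sources.
    constexpr int baseIntReg       = 1; // allocated unconditionally before the #ifdef TARGET_ARM64 block
    constexpr int pairIntReg       = 1; // size >= 2 * REGSIZE_BYTES: second register for int ldp/stp
    constexpr int simdRegPair      = 2; // local src and dst, size >= 2 * FP_REGSIZE_BYTES: Q-register pair
    constexpr int addrAdjustIntReg = 1; // both addresses may need an offset-adjusted base register
    constexpr int worstCaseInternalRegs = baseIntReg + pairIntReg + simdRegPair + addrAdjustIntReg;
    static_assert(worstCaseInternalRegs == 5, "matches the new MaxInternalCount in lsra.h");
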
