diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 227de58aedc97..66ecbac761cb5 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -2840,8 +2840,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // When both addresses are not 16-byte aligned the CopyBlock instruction sequence starts with padding // str instruction. For example, when both addresses are 8-byte aligned the instruction sequence looks like // - // ldr D_simdReg1, [srcReg, #srcOffset] - // str D_simdReg1, [dstReg, #dstOffset] + // ldr X_intReg1, [srcReg, #srcOffset] + // str X_intReg1, [dstReg, #dstOffset] // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+8] // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+8] // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+40] @@ -2853,7 +2853,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) // be profitable). const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown && - (size >= FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment); + (size >= 2 * FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment); bool shouldUse16ByteWideInstrs = false; @@ -2876,8 +2876,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) { // In order to use 16-byte instructions the JIT needs to adjust either srcOffset or dstOffset. // The JIT should use 16-byte loads and stores when the resulting sequence (incl. an additional add - // instruction) - // has fewer number of instructions. + // instruction) has fewer number of instructions. if (helper.InstructionCount(FP_REGSIZE_BYTES) + 1 < helper.InstructionCount(REGSIZE_BYTES)) { @@ -2937,51 +2936,31 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node) const unsigned intRegCount = node->AvailableTempRegCount(RBM_ALLINT); - switch (intRegCount) + if (intRegCount >= 2) { - case 1: - intReg1 = node->GetSingleTempReg(RBM_ALLINT); - break; - case 2: - intReg1 = node->ExtractTempReg(RBM_ALLINT); - intReg2 = node->GetSingleTempReg(RBM_ALLINT); - break; - default: - break; + intReg1 = node->ExtractTempReg(RBM_ALLINT); + intReg2 = node->ExtractTempReg(RBM_ALLINT); } - - regNumber simdReg1 = REG_NA; - regNumber simdReg2 = REG_NA; - - const unsigned simdRegCount = node->AvailableTempRegCount(RBM_ALLFLOAT); - - switch (simdRegCount) + else if (intRegCount == 1) { - case 1: - simdReg1 = node->GetSingleTempReg(RBM_ALLFLOAT); - break; - case 2: - simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); - simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT); - break; - default: - break; + intReg1 = node->GetSingleTempReg(RBM_ALLINT); + intReg2 = rsGetRsvdReg(); + } + else + { + intReg1 = rsGetRsvdReg(); } if (shouldUse16ByteWideInstrs) { + const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT); + const regNumber simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT); + helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); } else { - if (intReg2 == REG_NA) - { - helper.Unroll(REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter()); - } - else - { - helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); - } + helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter()); } #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 1b549424f0fc4..052c3f93999a3 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1772,7 +1772,7 @@ class LinearScan : public LinearScanInterface // The following keep track of information about internal (temporary register) intervals // during the building of a single node. - static const int MaxInternalCount = 4; + static const int MaxInternalCount = 5; RefPosition* internalDefs[MaxInternalCount]; int internalCount = 0; bool setInternalRegsDelayFree; diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index ef7a2d0440aa4..f85cb81d9e8c3 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -694,36 +694,40 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) { buildInternalIntRegisterDefForNode(blkNode); #ifdef TARGET_ARM64 - const bool dstAddrMayNeedReg = dstAddr->isContained(); - const bool srcAddrMayNeedReg = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || - ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained()); + const bool canUseLoadStorePairIntRegsInstrs = (size >= 2 * REGSIZE_BYTES); - if (srcAddrMayNeedReg && dstAddrMayNeedReg) + if (canUseLoadStorePairIntRegsInstrs) { - // The following allocates an additional integer register in a case - // when a load instruction and a store instruction cannot be encoded. + // CodeGen can use ldp/stp instructions sequence. buildInternalIntRegisterDefForNode(blkNode); - // In this case, CodeGen will use a SIMD register for copying. - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - // And in case of a larger block size, two SIMD registers. - if (size >= 2 * REGSIZE_BYTES) - { - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - } } - else if (srcAddrMayNeedReg || dstAddrMayNeedReg) + + const bool isSrcAddrLocal = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) || + ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr()); + const bool isDstAddrLocal = dstAddr->OperIsLocalAddr(); + + // CodeGen can use 16-byte SIMD ldp/stp for larger block sizes + // only when both source and destination base address registers have known alignment. + // This is the case, when both registers are either sp or fp. + bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES); + + // Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point + // whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal). + // Therefore, it must allocate an additional integer register anyway. + if (canUse16ByteWideInstrs) { - if (size >= 2 * REGSIZE_BYTES) - { - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); - } - else - { - buildInternalIntRegisterDefForNode(blkNode); - } + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); } - else if (size >= 2 * REGSIZE_BYTES) + + const bool srcAddrMayNeedReg = + isSrcAddrLocal || ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained()); + const bool dstAddrMayNeedReg = isDstAddrLocal || dstAddr->isContained(); + + // The following allocates an additional integer register in a case + // when a load instruction and a store instruction cannot be encoded using offset + // from a corresponding base register. + if (srcAddrMayNeedReg && dstAddrMayNeedReg) { buildInternalIntRegisterDefForNode(blkNode); }