From 227f6e51e720ced3ca59621d9eaf4f522bf91ce5 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 1 Mar 2024 02:49:16 +0100 Subject: [PATCH 1/3] Use SIMD to copy nongc blocks --- src/coreclr/jit/codegenarm64.cpp | 66 ++++++++++++++++++++++++-------- src/coreclr/jit/lsraarmarch.cpp | 7 ++++ 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 498f227e48d5be..2999ee4107c1e8 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3626,7 +3626,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) unsigned slots = layout->GetSlotCount(); // Temp register(s) used to perform the sequence of loads and stores. - regNumber tmpReg = cpObjNode->ExtractTempReg(); + regNumber tmpReg = cpObjNode->ExtractTempReg(RBM_ALLINT); regNumber tmpReg2 = REG_NA; assert(genIsValidIntReg(tmpReg)); @@ -3635,7 +3635,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) if (slots > 1) { - tmpReg2 = cpObjNode->GetSingleTempReg(); + tmpReg2 = cpObjNode->ExtractTempReg(RBM_ALLINT); assert(tmpReg2 != tmpReg); assert(genIsValidIntReg(tmpReg2)); assert(tmpReg2 != REG_WRITE_BARRIER_DST_BYREF); @@ -3682,26 +3682,60 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) { unsigned gcPtrCount = cpObjNode->GetLayout()->GetGCPtrCount(); + // We might also need SIMD regs if we have 4 or more continuous non-gc slots + // On ARM64, SIMD loads/stores provide 8-byte atomicity guarantees when aligned to 8 bytes. + regNumber tmpSimdReg1 = REG_NA; + regNumber tmpSimdReg2 = REG_NA; + if ((slots >= 4) && compiler->IsBaselineSimdIsaSupported()) + { + tmpSimdReg1 = cpObjNode->ExtractTempReg(RBM_ALLFLOAT); + tmpSimdReg2 = cpObjNode->ExtractTempReg(RBM_ALLFLOAT); + } + unsigned i = 0; while (i < slots) { if (!layout->IsGCPtr(i)) { - // Check if the next slot's type is also TYP_GC_NONE and use ldp/stp - if ((i + 1 < slots) && !layout->IsGCPtr(i + 1)) + // How many continuous non-gc slots do we have? + unsigned nonGcSlots = 0; + do { - emit->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_SRC_BYREF, - 2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX); - emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_DST_BYREF, - 2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX); - ++i; // extra increment of i, since we are copying two items - } - else + nonGcSlots++; + i++; + } while ((i < slots) && !layout->IsGCPtr(i)); + + const regNumber srcReg = REG_WRITE_BARRIER_SRC_BYREF; + const regNumber dstReg = REG_WRITE_BARRIER_DST_BYREF; + while (nonGcSlots > 0) { - emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE, - INS_OPTS_POST_INDEX); - emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE, - INS_OPTS_POST_INDEX); + regNumber tmp1 = tmpReg; + regNumber tmp2 = tmpReg2; + emitAttr size = EA_8BYTE; + insOpts opts = INS_OPTS_POST_INDEX; + + // Copy at least two slots at a time + if (nonGcSlots >= 2) + { + // Do 4 slots at a time if SIMD is supported + if ((nonGcSlots >= 4) && compiler->IsBaselineSimdIsaSupported()) + { + // We need SIMD temp regs now + tmp1 = tmpSimdReg1; + tmp2 = tmpSimdReg2; + size = EA_16BYTE; + nonGcSlots -= 2; + } + nonGcSlots -= 2; + + emit->emitIns_R_R_R_I(INS_ldp, size, tmp1, tmp2, srcReg, EA_SIZE(size) * 2, opts); + emit->emitIns_R_R_R_I(INS_stp, size, tmp1, tmp2, dstReg, EA_SIZE(size) * 2, opts); + } + else + { + emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmp1, tmp2, EA_SIZE(size), opts); + emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmp1, tmp2, EA_SIZE(size), opts); + } } } else @@ -3709,8 +3743,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) // In the case of a GC-Pointer we'll call the ByRef write barrier helper genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE); gcPtrCount--; + i++; } - ++i; } assert(gcPtrCount == 0); } diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index 1df68f5f3f5749..0f5761c2d03335 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -698,6 +698,13 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) buildInternalIntRegisterDefForNode(blkNode, internalIntCandidates); } + if (size >= 4 * REGSIZE_BYTES && compiler->IsBaselineSimdIsaSupported()) + { + // We can use 128-bit SIMD ldp/stp for larger block sizes + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + } + // If we have a dest address we want it in RBM_WRITE_BARRIER_DST_BYREF. dstAddrRegMask = RBM_WRITE_BARRIER_DST_BYREF; From 8c32736562aaea14f3fb83a68af0d4ba8ba890dd Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 1 Mar 2024 02:57:28 +0100 Subject: [PATCH 2/3] fix bug --- src/coreclr/jit/codegenarm64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 2999ee4107c1e8..4b15fd25c95cf2 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3727,12 +3727,12 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) nonGcSlots -= 2; } nonGcSlots -= 2; - emit->emitIns_R_R_R_I(INS_ldp, size, tmp1, tmp2, srcReg, EA_SIZE(size) * 2, opts); emit->emitIns_R_R_R_I(INS_stp, size, tmp1, tmp2, dstReg, EA_SIZE(size) * 2, opts); } else { + nonGcSlots--; emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmp1, tmp2, EA_SIZE(size), opts); emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmp1, tmp2, EA_SIZE(size), opts); } From 1efab4f1542c4a0356776c1e8a1d1242bf92c2bc Mon Sep 17 00:00:00 2001 From: EgorBo Date: Fri, 1 Mar 2024 03:04:16 +0100 Subject: [PATCH 3/3] Oops --- src/coreclr/jit/codegenarm64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 4b15fd25c95cf2..81370e6413835f 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -3733,8 +3733,8 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) else { nonGcSlots--; - emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmp1, tmp2, EA_SIZE(size), opts); - emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmp1, tmp2, EA_SIZE(size), opts); + emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmp1, srcReg, EA_SIZE(size), opts); + emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmp1, dstReg, EA_SIZE(size), opts); } } }