Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARM64: Use SIMD to copy nongc gaps in blocks with gc pointers #99140

Merged
merged 3 commits into from
Mar 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 50 additions & 16 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3626,7 +3626,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
unsigned slots = layout->GetSlotCount();

// Temp register(s) used to perform the sequence of loads and stores.
regNumber tmpReg = cpObjNode->ExtractTempReg();
regNumber tmpReg = cpObjNode->ExtractTempReg(RBM_ALLINT);
regNumber tmpReg2 = REG_NA;

assert(genIsValidIntReg(tmpReg));
Expand All @@ -3635,7 +3635,7 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)

if (slots > 1)
{
tmpReg2 = cpObjNode->GetSingleTempReg();
tmpReg2 = cpObjNode->ExtractTempReg(RBM_ALLINT);
assert(tmpReg2 != tmpReg);
assert(genIsValidIntReg(tmpReg2));
assert(tmpReg2 != REG_WRITE_BARRIER_DST_BYREF);
Expand Down Expand Up @@ -3682,35 +3682,69 @@ void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode)
{
unsigned gcPtrCount = cpObjNode->GetLayout()->GetGCPtrCount();

// We might also need SIMD regs if we have 4 or more contiguous non-gc slots
// On ARM64, SIMD loads/stores provide 8-byte atomicity guarantees when aligned to 8 bytes.
regNumber tmpSimdReg1 = REG_NA;
regNumber tmpSimdReg2 = REG_NA;
if ((slots >= 4) && compiler->IsBaselineSimdIsaSupported())
{
tmpSimdReg1 = cpObjNode->ExtractTempReg(RBM_ALLFLOAT);
tmpSimdReg2 = cpObjNode->ExtractTempReg(RBM_ALLFLOAT);
}

unsigned i = 0;
while (i < slots)
{
if (!layout->IsGCPtr(i))
{
// Check if the next slot's type is also TYP_GC_NONE and use ldp/stp
if ((i + 1 < slots) && !layout->IsGCPtr(i + 1))
// How many contiguous non-gc slots do we have?
unsigned nonGcSlots = 0;
do
{
emit->emitIns_R_R_R_I(INS_ldp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_SRC_BYREF,
2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
emit->emitIns_R_R_R_I(INS_stp, EA_8BYTE, tmpReg, tmpReg2, REG_WRITE_BARRIER_DST_BYREF,
2 * TARGET_POINTER_SIZE, INS_OPTS_POST_INDEX);
++i; // extra increment of i, since we are copying two items
}
else
nonGcSlots++;
i++;
} while ((i < slots) && !layout->IsGCPtr(i));

const regNumber srcReg = REG_WRITE_BARRIER_SRC_BYREF;
const regNumber dstReg = REG_WRITE_BARRIER_DST_BYREF;
while (nonGcSlots > 0)
{
emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_SRC_BYREF, TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX);
emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmpReg, REG_WRITE_BARRIER_DST_BYREF, TARGET_POINTER_SIZE,
INS_OPTS_POST_INDEX);
regNumber tmp1 = tmpReg;
regNumber tmp2 = tmpReg2;
emitAttr size = EA_8BYTE;
insOpts opts = INS_OPTS_POST_INDEX;

// Copy at least two slots at a time
if (nonGcSlots >= 2)
{
// Do 4 slots at a time if SIMD is supported
if ((nonGcSlots >= 4) && compiler->IsBaselineSimdIsaSupported())
{
// We need SIMD temp regs now
tmp1 = tmpSimdReg1;
tmp2 = tmpSimdReg2;
size = EA_16BYTE;
nonGcSlots -= 2;
}
nonGcSlots -= 2;
emit->emitIns_R_R_R_I(INS_ldp, size, tmp1, tmp2, srcReg, EA_SIZE(size) * 2, opts);
emit->emitIns_R_R_R_I(INS_stp, size, tmp1, tmp2, dstReg, EA_SIZE(size) * 2, opts);
}
else
{
nonGcSlots--;
emit->emitIns_R_R_I(INS_ldr, EA_8BYTE, tmp1, srcReg, EA_SIZE(size), opts);
emit->emitIns_R_R_I(INS_str, EA_8BYTE, tmp1, dstReg, EA_SIZE(size), opts);
}
}
}
else
{
// In the case of a GC-Pointer we'll call the ByRef write barrier helper
genEmitHelperCall(CORINFO_HELP_ASSIGN_BYREF, 0, EA_PTRSIZE);
gcPtrCount--;
i++;
}
++i;
}
assert(gcPtrCount == 0);
}
Expand Down
7 changes: 7 additions & 0 deletions src/coreclr/jit/lsraarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,13 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
buildInternalIntRegisterDefForNode(blkNode, internalIntCandidates);
}

if (size >= 4 * REGSIZE_BYTES && compiler->IsBaselineSimdIsaSupported())
{
// We can use 128-bit SIMD ldp/stp for larger block sizes
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}

// If we have a dest address we want it in RBM_WRITE_BARRIER_DST_BYREF.
dstAddrRegMask = RBM_WRITE_BARRIER_DST_BYREF;

Expand Down