Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Arm64] Don't use D-copies in CopyBlock #63588

Merged
merged 4 commits into from
Jan 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 18 additions & 39 deletions src/coreclr/jit/codegenarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2840,8 +2840,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
// When both addresses are not 16-byte aligned the CopyBlock instruction sequence starts with padding
// str instruction. For example, when both addresses are 8-byte aligned the instruction sequence looks like
//
// ldr D_simdReg1, [srcReg, #srcOffset]
// str D_simdReg1, [dstReg, #dstOffset]
// ldr X_intReg1, [srcReg, #srcOffset]
// str X_intReg1, [dstReg, #dstOffset]
// ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+8]
// stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+8]
// ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+40]
Expand All @@ -2853,7 +2853,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
// be profitable).

const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown &&
(size >= FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
(size >= 2 * FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
echesakov marked this conversation as resolved.
Show resolved Hide resolved

bool shouldUse16ByteWideInstrs = false;

Expand All @@ -2876,8 +2876,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
{
// In order to use 16-byte instructions the JIT needs to adjust either srcOffset or dstOffset.
// The JIT should use 16-byte loads and stores when the resulting sequence (incl. an additional add
// instruction)
// has fewer number of instructions.
// instruction) has fewer number of instructions.

if (helper.InstructionCount(FP_REGSIZE_BYTES) + 1 < helper.InstructionCount(REGSIZE_BYTES))
{
Expand Down Expand Up @@ -2937,51 +2936,31 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)

const unsigned intRegCount = node->AvailableTempRegCount(RBM_ALLINT);

switch (intRegCount)
if (intRegCount >= 2)
{
case 1:
intReg1 = node->GetSingleTempReg(RBM_ALLINT);
break;
case 2:
intReg1 = node->ExtractTempReg(RBM_ALLINT);
intReg2 = node->GetSingleTempReg(RBM_ALLINT);
break;
default:
break;
intReg1 = node->ExtractTempReg(RBM_ALLINT);
intReg2 = node->ExtractTempReg(RBM_ALLINT);
}

regNumber simdReg1 = REG_NA;
regNumber simdReg2 = REG_NA;

const unsigned simdRegCount = node->AvailableTempRegCount(RBM_ALLFLOAT);

switch (simdRegCount)
else if (intRegCount == 1)
{
case 1:
simdReg1 = node->GetSingleTempReg(RBM_ALLFLOAT);
break;
case 2:
simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT);
simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT);
break;
default:
break;
intReg1 = node->GetSingleTempReg(RBM_ALLINT);
intReg2 = rsGetRsvdReg();
}
else
{
intReg1 = rsGetRsvdReg();
}

if (shouldUse16ByteWideInstrs)
{
const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT);
const regNumber simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT);

helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
}
else
{
if (intReg2 == REG_NA)
{
helper.Unroll(REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
}
else
{
helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter());
}
helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter());
}
#endif // TARGET_ARM64

Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/lsra.h
Original file line number Diff line number Diff line change
Expand Up @@ -1772,7 +1772,7 @@ class LinearScan : public LinearScanInterface

// The following keep track of information about internal (temporary register) intervals
// during the building of a single node.
static const int MaxInternalCount = 4;
static const int MaxInternalCount = 5;
RefPosition* internalDefs[MaxInternalCount];
int internalCount = 0;
bool setInternalRegsDelayFree;
Expand Down
52 changes: 28 additions & 24 deletions src/coreclr/jit/lsraarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -694,36 +694,40 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
{
buildInternalIntRegisterDefForNode(blkNode);
#ifdef TARGET_ARM64
const bool dstAddrMayNeedReg = dstAddr->isContained();
const bool srcAddrMayNeedReg = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained());
const bool canUseLoadStorePairIntRegsInstrs = (size >= 2 * REGSIZE_BYTES);

if (srcAddrMayNeedReg && dstAddrMayNeedReg)
if (canUseLoadStorePairIntRegsInstrs)
{
// The following allocates an additional integer register in a case
// when a load instruction and a store instruction cannot be encoded.
// CodeGen can use ldp/stp instructions sequence.
buildInternalIntRegisterDefForNode(blkNode);
// In this case, CodeGen will use a SIMD register for copying.
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
// And in case of a larger block size, two SIMD registers.
if (size >= 2 * REGSIZE_BYTES)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}
}
else if (srcAddrMayNeedReg || dstAddrMayNeedReg)

const bool isSrcAddrLocal = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr());
const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();

// CodeGen can use 16-byte SIMD ldp/stp for larger block sizes
// only when both source and destination base address registers have known alignment.
// This is the case, when both registers are either sp or fp.
bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES);

// Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point
// whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal).
// Therefore, it must allocate an additional integer register anyway.
if (canUse16ByteWideInstrs)
{
if (size >= 2 * REGSIZE_BYTES)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}
else
{
buildInternalIntRegisterDefForNode(blkNode);
}
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
}
else if (size >= 2 * REGSIZE_BYTES)

const bool srcAddrMayNeedReg =
isSrcAddrLocal || ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained());
const bool dstAddrMayNeedReg = isDstAddrLocal || dstAddr->isContained();

// The following allocates an additional integer register in a case
// when a load instruction and a store instruction cannot be encoded using offset
// from a corresponding base register.
if (srcAddrMayNeedReg && dstAddrMayNeedReg)
{
buildInternalIntRegisterDefForNode(blkNode);
}
Expand Down