[Arm64] Don't use D-copies in CopyBlock (#63588)
* Increase the maximum number of internal registers allowed per node in src/coreclr/jit/lsra.h

* Based on discussion in #63453, don't allocate a SIMD register pair if the JIT won't be able to use Q-copies in src/coreclr/jit/lsraarmarch.cpp

* Update CodeGen to reflect that Q-copies should be used only when size >= 2 * FP_REGSIZE_BYTES and using them makes the instruction sequence shorter in src/coreclr/jit/codegenarmarch.cpp (see the sketch below)

* Update comment - we don't use D-copies after that change in src/coreclr/jit/codegenarmarch.cpp
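
To give a rough feel for the size and instruction-count conditions above, here is a small standalone sketch. It is an illustrative model only, not the JIT's CopyBlockUnrollHelper::InstructionCount: it assumes every load/store moves a full register, that adjacent accesses pair into ldp/stp, and that the extra add needed to realign offsets costs one instruction.

    // Illustrative model only -- not the JIT's CopyBlockUnrollHelper. It assumes every
    // load/store moves a full register, adjacent accesses fuse into ldp/stp pairs, and
    // the residual tail costs one extra load/store pair.
    #include <cstdio>

    static int EstimateCopyInstrCount(unsigned size, unsigned regSize)
    {
        unsigned fullRegs = size / regSize;                 // whole registers moved
        unsigned pairs    = fullRegs / 2;                   // ldp/stp move two registers at once
        unsigned singles  = fullRegs % 2;                   // leftover single ldr/str
        unsigned tail     = (size % regSize != 0) ? 1 : 0;  // residual bytes -> one more ldr/str
        // Every pair is ldp+stp, every single/tail is ldr+str: two instructions each.
        return 2 * static_cast<int>(pairs + singles + tail);
    }

    int main()
    {
        const unsigned REGSIZE_BYTES    = 8;  // X (integer) registers
        const unsigned FP_REGSIZE_BYTES = 16; // Q (SIMD) registers
        const unsigned sizes[]          = {16, 32, 48, 96};
        for (unsigned size : sizes)
        {
            int xCopies = EstimateCopyInstrCount(size, REGSIZE_BYTES);
            int qCopies = EstimateCopyInstrCount(size, FP_REGSIZE_BYTES);
            // Mirrors the shape of the codegen heuristic: prefer Q-copies only when the
            // sequence, including one extra add to adjust the offsets, ends up shorter.
            bool useQ = (qCopies + 1) < xCopies;
            printf("size=%2u: X-copies=%2d Q-copies(+add)=%2d -> %s\n", size, xCopies, qCopies + 1,
                   useQ ? "use Q" : "use X");
        }
        return 0;
    }
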
echesakov authored Jan 19, 2022
1 parent ac4c7fb commit 675c167
Showing 3 changed files with 47 additions and 64 deletions.
57 changes: 18 additions & 39 deletions src/coreclr/jit/codegenarmarch.cpp
@@ -2840,8 +2840,8 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     // When both addresses are not 16-byte aligned the CopyBlock instruction sequence starts with padding
     // str instruction. For example, when both addresses are 8-byte aligned the instruction sequence looks like
     //
-    // ldr D_simdReg1, [srcReg, #srcOffset]
-    // str D_simdReg1, [dstReg, #dstOffset]
+    // ldr X_intReg1, [srcReg, #srcOffset]
+    // str X_intReg1, [dstReg, #dstOffset]
     // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+8]
     // stp Q_simdReg1, Q_simdReg2, [dstReg, #dstOffset+8]
     // ldp Q_simdReg1, Q_simdReg2, [srcReg, #srcOffset+40]
@@ -2853,7 +2853,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     // be profitable).
 
     const bool canUse16ByteWideInstrs = isSrcRegAddrAlignmentKnown && isDstRegAddrAlignmentKnown &&
-                                        (size >= FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
+                                        (size >= 2 * FP_REGSIZE_BYTES) && (srcRegAddrAlignment == dstRegAddrAlignment);
 
     bool shouldUse16ByteWideInstrs = false;
 
@@ -2876,8 +2876,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     {
         // In order to use 16-byte instructions the JIT needs to adjust either srcOffset or dstOffset.
         // The JIT should use 16-byte loads and stores when the resulting sequence (incl. an additional add
-        // instruction)
-        // has fewer number of instructions.
+        // instruction) has fewer number of instructions.
 
         if (helper.InstructionCount(FP_REGSIZE_BYTES) + 1 < helper.InstructionCount(REGSIZE_BYTES))
         {
@@ -2937,51 +2936,31 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
     const unsigned intRegCount = node->AvailableTempRegCount(RBM_ALLINT);
 
-    switch (intRegCount)
+    if (intRegCount >= 2)
     {
-        case 1:
-            intReg1 = node->GetSingleTempReg(RBM_ALLINT);
-            break;
-        case 2:
-            intReg1 = node->ExtractTempReg(RBM_ALLINT);
-            intReg2 = node->GetSingleTempReg(RBM_ALLINT);
-            break;
-        default:
-            break;
+        intReg1 = node->ExtractTempReg(RBM_ALLINT);
+        intReg2 = node->ExtractTempReg(RBM_ALLINT);
     }
-
-    regNumber simdReg1 = REG_NA;
-    regNumber simdReg2 = REG_NA;
-
-    const unsigned simdRegCount = node->AvailableTempRegCount(RBM_ALLFLOAT);
-
-    switch (simdRegCount)
+    else if (intRegCount == 1)
     {
-        case 1:
-            simdReg1 = node->GetSingleTempReg(RBM_ALLFLOAT);
-            break;
-        case 2:
-            simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT);
-            simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT);
-            break;
-        default:
-            break;
+        intReg1 = node->GetSingleTempReg(RBM_ALLINT);
+        intReg2 = rsGetRsvdReg();
     }
+    else
+    {
+        intReg1 = rsGetRsvdReg();
+    }
 
     if (shouldUse16ByteWideInstrs)
     {
+        const regNumber simdReg1 = node->ExtractTempReg(RBM_ALLFLOAT);
+        const regNumber simdReg2 = node->GetSingleTempReg(RBM_ALLFLOAT);
+
         helper.Unroll(FP_REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
     }
     else
     {
-        if (intReg2 == REG_NA)
-        {
-            helper.Unroll(REGSIZE_BYTES, intReg1, simdReg1, simdReg2, srcReg, dstReg, GetEmitter());
-        }
-        else
-        {
-            helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter());
-        }
+        helper.UnrollBaseInstrs(intReg1, intReg2, srcReg, dstReg, GetEmitter());
     }
 #endif // TARGET_ARM64
 
2 changes: 1 addition & 1 deletion src/coreclr/jit/lsra.h
@@ -1772,7 +1772,7 @@ class LinearScan : public LinearScanInterface
 
     // The following keep track of information about internal (temporary register) intervals
     // during the building of a single node.
-    static const int MaxInternalCount = 4;
+    static const int MaxInternalCount = 5;
     RefPosition* internalDefs[MaxInternalCount];
     int internalCount = 0;
     bool setInternalRegsDelayFree;
52 changes: 28 additions & 24 deletions src/coreclr/jit/lsraarmarch.cpp
@@ -694,36 +694,40 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
             {
                 buildInternalIntRegisterDefForNode(blkNode);
 #ifdef TARGET_ARM64
-                const bool dstAddrMayNeedReg = dstAddr->isContained();
-                const bool srcAddrMayNeedReg = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
-                                               ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained());
+                const bool canUseLoadStorePairIntRegsInstrs = (size >= 2 * REGSIZE_BYTES);
 
-                if (srcAddrMayNeedReg && dstAddrMayNeedReg)
+                if (canUseLoadStorePairIntRegsInstrs)
                 {
-                    // The following allocates an additional integer register in a case
-                    // when a load instruction and a store instruction cannot be encoded.
+                    // CodeGen can use ldp/stp instructions sequence.
                     buildInternalIntRegisterDefForNode(blkNode);
-                    // In this case, CodeGen will use a SIMD register for copying.
-                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    // And in case of a larger block size, two SIMD registers.
-                    if (size >= 2 * REGSIZE_BYTES)
-                    {
-                        buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    }
                 }
-                else if (srcAddrMayNeedReg || dstAddrMayNeedReg)
+
+                const bool isSrcAddrLocal = src->OperIs(GT_LCL_VAR, GT_LCL_FLD) ||
+                                            ((srcAddrOrFill != nullptr) && srcAddrOrFill->OperIsLocalAddr());
+                const bool isDstAddrLocal = dstAddr->OperIsLocalAddr();
+
+                // CodeGen can use 16-byte SIMD ldp/stp for larger block sizes
+                // only when both source and destination base address registers have known alignment.
+                // This is the case, when both registers are either sp or fp.
+                bool canUse16ByteWideInstrs = isSrcAddrLocal && isDstAddrLocal && (size >= 2 * FP_REGSIZE_BYTES);
+
+                // Note that the SIMD registers allocation is speculative - LSRA doesn't know at this point
+                // whether CodeGen will use SIMD registers (i.e. if such instruction sequence will be more optimal).
+                // Therefore, it must allocate an additional integer register anyway.
+                if (canUse16ByteWideInstrs)
                 {
-                    if (size >= 2 * REGSIZE_BYTES)
-                    {
-                        buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                        buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    }
-                    else
-                    {
-                        buildInternalIntRegisterDefForNode(blkNode);
-                    }
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
+                    buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
                 }
-                else if (size >= 2 * REGSIZE_BYTES)
+
+                const bool srcAddrMayNeedReg =
+                    isSrcAddrLocal || ((srcAddrOrFill != nullptr) && srcAddrOrFill->isContained());
+                const bool dstAddrMayNeedReg = isDstAddrLocal || dstAddr->isContained();
+
+                // The following allocates an additional integer register in a case
+                // when a load instruction and a store instruction cannot be encoded using offset
+                // from a corresponding base register.
+                if (srcAddrMayNeedReg && dstAddrMayNeedReg)
                 {
                     buildInternalIntRegisterDefForNode(blkNode);
                 }
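
Read together with the lsra.h change above, one way to see why MaxInternalCount moves from 4 to 5 is to tally the internal registers a single unrolled CopyBlock node can now request in the worst case. The sketch below is an interpretation of the BuildBlockStore diff, not an authoritative statement of the JIT's accounting, and the constant names are invented for illustration:

    // A back-of-the-envelope tally based on the BuildBlockStore diff above; the names are
    // descriptive only and do not exist in the JIT sources.
    constexpr int baseIntReg       = 1; // allocated unconditionally before the #ifdef TARGET_ARM64 block
    constexpr int pairIntReg       = 1; // size >= 2 * REGSIZE_BYTES: second register for int ldp/stp
    constexpr int simdRegPair      = 2; // local src and dst, size >= 2 * FP_REGSIZE_BYTES: Q-register pair
    constexpr int addrAdjustIntReg = 1; // both addresses may need an offset-adjusted base register
    constexpr int worstCaseInternalRegs = baseIntReg + pairIntReg + simdRegPair + addrAdjustIntReg;
    static_assert(worstCaseInternalRegs == 5, "matches the new MaxInternalCount in lsra.h");
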
