[Arm64] AdvSIMD LoadPairVector64 and LoadPairVector128 #45020

Closed
Changes from all commits (18 commits)
cbd01ca
Support mulx returning ValueTuple
CarolEidt Jun 12, 2020
7445ac0
Remove accidentally added build.out build-tests.out libs.out
echesakov Nov 19, 2020
a39417c
Add LoadPairVector64 and LoadPairVector128 in AdvSimd.cs AdvSimd.Plat…
echesakov Jul 9, 2020
5c200a0
Add LoadPairScalarVector64 in AdvSimd.cs AdvSimd.PlatformNotSupported.cs
echesakov Jul 9, 2020
41fda0d
Add LoadPairVector64NonTemporal and LoadPairVector128NonTemporal in A…
echesakov Jul 9, 2020
b1ba848
Add LoadPairScalarVector64NonTemporal in AdvSimd.cs AdvSimd.PlatformN…
echesakov Jul 9, 2020
c662e7f
Update System.Runtime.Intrinsics.cs
echesakov Jul 14, 2020
3608e63
Add LoadPairVector64 and LoadPairVector128 in hwintrinsiclistarm64.h
echesakov Jul 14, 2020
3f1f20a
Support LoadPairVector128/64 in Compiler::impHWIntrinsic in src/corec…
echesakov Nov 17, 2020
8873f70
Exclude changes affecting current code generation of Bmi2.MultiplyNoF…
echesakov Nov 19, 2020
a69244e
Support multireg hardware intrinsics on Arm64 in src/coreclr/src/jit/…
echesakov Nov 17, 2020
7b51ee7
Support LoadPairVector128/64 in Compiler::impSpecialIntrinsic in src/…
echesakov Nov 17, 2020
9828c6e
Support LoadPairVector128/64 in CodeGen::genHWIntrinsic in src/corecl…
echesakov Nov 17, 2020
b20717f
Extend LinearScan::BuildHWIntrinsic to support intrinsics returning v…
echesakov Nov 18, 2020
e6cb0d3
Add LoadPairVectorTest.template
echesakov Nov 18, 2020
4d808e3
Add LoadPairVector64 and LoadPairVector128 in GenerateTests.csx
echesakov Nov 18, 2020
3b97de6
Add .Take() in src/tests/JIT/HardwareIntrinsics/Arm/Shared/Helpers.cs…
echesakov Nov 19, 2020
d5aba6d
Update src/tests/JIT/HardwareIntrinsics/Arm/AdvSimd.Arm64/*
echesakov Nov 18, 2020
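To make the API surface added by these commits concrete, below is a minimal usage sketch in C#. It assumes, based on the AdvSimd.cs commits above, that the LoadPair methods take a pointer and return the two loaded vectors as a ValueTuple; the class and method names in the sketch are illustrative only and are not part of this change.

    // Sketch only: assumes AdvSimd.Arm64.LoadPairVector128 has roughly the shape
    // (Vector128<T>, Vector128<T>) LoadPairVector128(T* address) and maps to a single ldp.
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static unsafe class LoadPairUsageSketch
    {
        // Adds the two adjacent Vector128<float> values starting at 'address'.
        public static Vector128<float> AddAdjacent(float* address)
        {
            if (AdvSimd.Arm64.IsSupported)
            {
                // Expected to load 32 bytes with one q-form ldp, then a single fadd.
                var (lower, upper) = AdvSimd.Arm64.LoadPairVector128(address);
                return AdvSimd.Add(lower, upper);
            }

            // Portable fallback for platforms without Arm64 AdvSIMD.
            Vector128<float> result = default;
            for (int i = 0; i < 4; i++)
            {
                result = result.WithElement(i, address[i] + address[i + 4]);
            }
            return result;
        }
    }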
6 changes: 3 additions & 3 deletions src/coreclr/src/jit/gentree.cpp
@@ -711,13 +711,13 @@ int GenTree::GetRegisterDstCount(Compiler* compiler) const
}
#endif

#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
if (OperIs(GT_HWINTRINSIC))
if (OperIsHWIntrinsic())
{
assert(TypeGet() == TYP_STRUCT);
// TODO-ARM64-NYI: Support hardware intrinsics operating on multiple contiguous registers.
return 2;
}
#endif

if (OperIsScalarLocal())
{
return AsLclVar()->GetFieldCount(compiler);
35 changes: 24 additions & 11 deletions src/coreclr/src/jit/gentree.h
@@ -7243,12 +7243,12 @@ inline bool GenTree::IsMultiRegNode() const
return true;
}
#endif // FEATURE_MULTIREG_RET
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
if (OperIs(GT_HWINTRINSIC))

if (OperIsHWIntrinsic())
{
return (TypeGet() == TYP_STRUCT);
}
#endif

if (IsMultiRegLclVar())
{
return true;
@@ -7291,13 +7291,13 @@ inline unsigned GenTree::GetMultiRegCount()
return AsCopyOrReload()->GetRegCount();
}
#endif // FEATURE_MULTIREG_RET
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
if (OperIs(GT_HWINTRINSIC))

if (OperIsHWIntrinsic())
{
assert(TypeGet() == TYP_STRUCT);
return 2;
}
#endif

if (OperIs(GT_LCL_VAR, GT_STORE_LCL_VAR))
{
assert((gtFlags & GTF_VAR_MULTIREG) != 0);
@@ -7360,10 +7360,11 @@ inline regNumber GenTree::GetRegByIndex(int regIndex)
return AsCopyOrReload()->GetRegNumByIdx(regIndex);
}
#endif // FEATURE_MULTIREG_RET
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
#ifdef FEATURE_HW_INTRINSICS
if (OperIs(GT_HWINTRINSIC))
{
assert(regIndex == 1);
// TODO-ARM64-NYI: Support hardware intrinsics operating on multiple contiguous registers.
return AsHWIntrinsic()->GetOtherReg();
}
#endif
@@ -7417,15 +7418,27 @@ inline var_types GenTree::GetRegTypeByIndex(int regIndex)

#endif // FEATURE_MULTIREG_RET

#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
if (OperIs(GT_HWINTRINSIC))
if (OperIsHWIntrinsic())
{
assert(TypeGet() == TYP_STRUCT);

#ifdef TARGET_XARCH
// At this time, the only multi-reg HW intrinsics all return the type of their
// arguments. If this changes, we will need a way to record or determine this.
assert(TypeGet() == TYP_STRUCT);
return gtGetOp1()->TypeGet();
}
#elif defined(TARGET_ARM64)
if (AsHWIntrinsic()->gtSIMDSize == 16)
{
return TYP_SIMD16;
}
else
{
assert(AsHWIntrinsic()->gtSIMDSize == 8);
return TYP_SIMD8;
}
#endif
}

if (OperIs(GT_LCL_VAR, GT_STORE_LCL_VAR))
{
if (TypeGet() == TYP_LONG)
32 changes: 23 additions & 9 deletions src/coreclr/src/jit/hwintrinsic.cpp
@@ -785,16 +785,30 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic,

if ((retType == TYP_STRUCT) && featureSIMD)
{
unsigned int sizeBytes;
baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);
retType = getSIMDTypeForSize(sizeBytes);
assert(sizeBytes != 0);

// We want to return early here for cases where retType was TYP_STRUCT per the method signature,
// rather than deferring the decision until after getting the baseType of the arg.
if (!isSupportedBaseType(intrinsic, baseType))
unsigned int sizeBytes = 0;
baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes);

if (sizeBytes != 0)
{
return nullptr;
// We want to return early here for cases where retType was TYP_STRUCT per the method signature,
// rather than deferring the decision until after getting the baseType of the arg.
if (!isSupportedBaseType(intrinsic, baseType))
{
return nullptr;
}

retType = getSIMDTypeForSize(sizeBytes);
}
else
{
#ifdef TARGET_ARM64
assert((intrinsic == NI_AdvSimd_Arm64_LoadPairScalarVector64) ||
(intrinsic == NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal) ||
(intrinsic == NI_AdvSimd_Arm64_LoadPairVector64) ||
(intrinsic == NI_AdvSimd_Arm64_LoadPairVector64NonTemporal) ||
(intrinsic == NI_AdvSimd_Arm64_LoadPairVector128) ||
(intrinsic == NI_AdvSimd_Arm64_LoadPairVector128NonTemporal));
#endif
}
}

18 changes: 16 additions & 2 deletions src/coreclr/src/jit/hwintrinsicarm64.cpp
@@ -502,10 +502,24 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

default:
case NI_AdvSimd_Arm64_LoadPairScalarVector64:
case NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector64:
case NI_AdvSimd_Arm64_LoadPairVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector128:
case NI_AdvSimd_Arm64_LoadPairVector128NonTemporal:
{
return nullptr;
assert(retType == TYP_STRUCT);
assert(numArgs == 1);
op1 = impPopStack().val;
GenTreeHWIntrinsic* hwintrin = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
hwintrin->SetLayout(typGetObjLayout(sig->retTypeSigClass));
retNode = hwintrin;
break;
}

default:
return nullptr;
}

return retNode;
12 changes: 12 additions & 0 deletions src/coreclr/src/jit/hwintrinsiccodegenarm64.cpp
@@ -701,6 +701,18 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
}
break;

case NI_AdvSimd_Arm64_LoadPairVector128:
case NI_AdvSimd_Arm64_LoadPairVector128NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector64:
case NI_AdvSimd_Arm64_LoadPairVector64NonTemporal:
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, node->GetOtherReg(), op1Reg);
break;

case NI_AdvSimd_Arm64_LoadPairScalarVector64:
case NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal:
GetEmitter()->emitIns_R_R_R(ins, emitTypeSize(intrin.baseType), targetReg, node->GetOtherReg(), op1Reg);
break;

case NI_AdvSimd_Store:
GetEmitter()->emitIns_R_R(ins, emitSize, op2Reg, op1Reg, opt);
break;
6 changes: 6 additions & 0 deletions src/coreclr/src/jit/hwintrinsiclistarm64.h
@@ -419,6 +419,12 @@ HARDWARE_INTRINSIC(AdvSimd_Arm64, FusedMultiplySubtractBySelectedScalar,
HARDWARE_INTRINSIC(AdvSimd_Arm64, FusedMultiplySubtractScalarBySelectedScalar, 8, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmls, INS_fmls}, HW_Category_SIMDByIndexedElement, HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_SIMDScalar)
HARDWARE_INTRINSIC(AdvSimd_Arm64, InsertSelectedScalar, -1, 4, {INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins}, HW_Category_SIMD, HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_NoJmpTableIMM|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadAndReplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ld1r, INS_ld1r, INS_invalid, INS_ld1r}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairScalarVector64, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ldp, INS_ldp, INS_invalid, INS_invalid, INS_ldp, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairScalarVector64NonTemporal, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ldnp, INS_ldnp, INS_invalid, INS_invalid, INS_ldnp, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector64, 8, 1, {INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector64NonTemporal, 8, 1, {INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector128, 16, 1, {INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector128NonTemporal, 16, 1, {INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(AdvSimd_Arm64, Max, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmax}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd_Arm64, MaxAcross, -1, 1, {INS_smaxv, INS_umaxv, INS_smaxv, INS_umaxv, INS_smaxv, INS_umaxv, INS_invalid, INS_invalid, INS_fmaxv, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AdvSimd_Arm64, MaxNumber, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmaxnm}, HW_Category_SIMD, HW_Flag_Commutative)
2 changes: 1 addition & 1 deletion src/coreclr/src/jit/importer.cpp
@@ -1281,7 +1281,7 @@ GenTree* Compiler::impAssignStructPtr(GenTree* destAddr,
}

assert(src->OperIs(GT_LCL_VAR, GT_LCL_FLD, GT_FIELD, GT_IND, GT_OBJ, GT_CALL, GT_MKREFANY, GT_RET_EXPR, GT_COMMA) ||
(src->TypeGet() != TYP_STRUCT && src->OperIsSimdOrHWintrinsic()));
src->OperIsSimdOrHWintrinsic());

var_types asgType = src->TypeGet();

6 changes: 3 additions & 3 deletions src/coreclr/src/jit/lower.cpp
@@ -6178,10 +6178,10 @@ bool Lowering::CheckMultiRegLclVar(GenTreeLclVar* lclNode, const ReturnTypeDesc*
}
}
#ifdef TARGET_XARCH
// For local stores on XARCH we only handle mismatched src/dest register count for
// calls of SIMD type. If the source was another lclVar similarly promoted, we would
// For local stores on XARCH we can't handle another lclVar source.
// If the source was another lclVar similarly promoted, we would
// have broken it into multiple stores.
if (lclNode->OperIs(GT_STORE_LCL_VAR) && !lclNode->gtGetOp1()->OperIs(GT_CALL))
if (lclNode->OperIs(GT_STORE_LCL_VAR) && lclNode->gtGetOp1()->OperIs(GT_LCL_VAR))
{
canEnregister = false;
}
10 changes: 10 additions & 0 deletions src/coreclr/src/jit/lowerxarch.cpp
@@ -218,6 +218,16 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
#endif
}
}
else if (src->OperIsHWIntrinsic())
echesakov (Contributor, Author) commented on Nov 20, 2020:
@CarolEidt I think I need to implement similar logic on Arm64 to avoid the store-reload of returned SIMD values for ldp/ldnp, as in the following example:

        2C404410          ldnp    s16, s17, [x0]
        FD000FB0          str     d16, [fp,#24]
        FD0013B1          str     d17, [fp,#32]
        FD400FA8          ldr     d8, [fp,#24]
        FD4013A9          ldr     d9, [fp,#32]

Is that a correct understanding?

Contributor replied:
Yes, I believe that's correct. However, this path is only for the case where you wind up with a STORE_BLK, as opposed to a STORE_LCL_VAR. If the lhs is a multi-reg lclVar, you should have the latter.

{
assert(!blkNode->AsObj()->GetLayout()->HasGCPtr());
if (blkNode->OperIs(GT_STORE_OBJ))
{
blkNode->SetOper(GT_STORE_BLK);
}
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
ContainBlockStoreAddress(blkNode, size, dstAddr);
}
else
{
assert(src->OperIs(GT_IND, GT_LCL_VAR, GT_LCL_FLD));
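For context, the store-reload sequence quoted in the review thread above could arise from a call pattern roughly like the following C# sketch; the OpaqueCall helper is hypothetical and only serves to keep the loaded pair live across a call:

    // Sketch: LoadPairScalarVector64NonTemporal is assumed to return a ValueTuple of two
    // Vector64<float> values (the ldnp s16, s17 in the comment above). Keeping the pair
    // live across another call typically forces it into callee-saved registers (d8/d9);
    // without multi-reg lowering support, the two results may first be spilled to the
    // frame and reloaded, as shown in the quoted disassembly.
    using System.Runtime.Intrinsics;
    using System.Runtime.Intrinsics.Arm;

    static unsafe class LoadPairSpillSketch
    {
        public static Vector64<float> Sample(float* address)
        {
            var pair = AdvSimd.Arm64.LoadPairScalarVector64NonTemporal(address);
            OpaqueCall(); // hypothetical call that keeps 'pair' live across a call boundary
            return AdvSimd.Add(pair.Item1, pair.Item2);
        }

        static void OpaqueCall() { }
    }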
2 changes: 1 addition & 1 deletion src/coreclr/src/jit/lsra.cpp
@@ -152,7 +152,7 @@ void lsraAssignRegToTree(GenTree* tree, regNumber reg, unsigned regIdx)
putArg->SetRegNumByIdx(reg, regIdx);
}
#endif // FEATURE_ARG_SPLIT
#if defined(TARGET_XARCH) && defined(FEATURE_HW_INTRINSICS)
#ifdef FEATURE_HW_INTRINSICS
else if (tree->OperIs(GT_HWINTRINSIC))
{
assert(regIdx == 1);
4 changes: 2 additions & 2 deletions src/coreclr/src/jit/lsra.h
@@ -1068,7 +1068,7 @@ class LinearScan : public LinearScanInterface
var_types type = tree->TypeGet();
if (type == TYP_STRUCT)
{
assert(tree->OperIs(GT_LCL_VAR, GT_STORE_LCL_VAR));
assert(tree->OperIs(GT_LCL_VAR, GT_STORE_LCL_VAR) || tree->OperIsHWIntrinsic());
GenTreeLclVar* lclVar = tree->AsLclVar();
LclVarDsc* varDsc = compiler->lvaGetDesc(lclVar);
type = varDsc->GetRegisterType(lclVar);
@@ -1613,7 +1613,7 @@ class LinearScan : public LinearScanInterface
#endif // FEATURE_SIMD

#ifdef FEATURE_HW_INTRINSICS
int BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree);
int BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount);
#endif // FEATURE_HW_INTRINSICS

int BuildPutArgStk(GenTreePutArgStk* argNode);
34 changes: 31 additions & 3 deletions src/coreclr/src/jit/lsraarm64.cpp
@@ -345,7 +345,7 @@ int LinearScan::BuildNode(GenTree* tree)

#ifdef FEATURE_HW_INTRINSICS
case GT_HWINTRINSIC:
srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic());
srcCount = BuildHWIntrinsic(tree->AsHWIntrinsic(), &dstCount);
break;
#endif // FEATURE_HW_INTRINSICS

@@ -944,17 +944,39 @@ int LinearScan::BuildSIMD(GenTreeSIMD* simdTree)
//
// Arguments:
// tree - The GT_HWINTRINSIC node of interest
// pDstCount - OUT parameter - the number of registers defined for the given node
//
// Return Value:
// The number of sources consumed by this node.
//
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount)
{
assert(pDstCount != nullptr);

const HWIntrinsic intrin(intrinsicTree);

int srcCount = 0;
int dstCount = intrinsicTree->IsValue() ? 1 : 0;

switch (intrinsicTree->gtHWIntrinsicId)
{
// TODO-Arm64-Cleanup: Hardware intrinsics that operate on multiple contiguous registers
// would require more cases like these and, perhaps, we should look at a way to encode
// the information in the intrinsics table using a declarative approach.
case NI_AdvSimd_Arm64_LoadPairScalarVector64:
case NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector64:
case NI_AdvSimd_Arm64_LoadPairVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector128:
case NI_AdvSimd_Arm64_LoadPairVector128NonTemporal:
dstCount = 2;
break;

default:
assert(intrinsicTree->TypeGet() != TYP_STRUCT);
break;
}

const bool hasImmediateOperand = HWIntrinsicInfo::HasImmediateOperand(intrin.id);

if (hasImmediateOperand && !HWIntrinsicInfo::NoJmpTableImm(intrin.id))
Expand Down Expand Up @@ -1197,15 +1219,21 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)

buildInternalRegisterUses();

if (dstCount == 1)
if ((dstCount == 1) || (dstCount == 2))
{
BuildDef(intrinsicTree);

if (dstCount == 2)
{
BuildDef(intrinsicTree, RBM_NONE, 1);
}
}
else
{
assert(dstCount == 0);
}

*pDstCount = dstCount;
return srcCount;
}
#endif