Skip to content

Commit

Permalink
Arm64: Implement LoadVector64x* and LoadVector128x* APIs (#92855)
Browse files Browse the repository at this point in the history
* Add APIs for LoadVector*x2

* Add implementation for LoadVector*x2

* Add APIs for LoadVector*x3

* Add implementation for LoadVector*x3

* Add APIs for LoadVector*x4

* Add implementation for LoadVector*x4

* Add test cases for LoadVectorx2, LoadVectorx3, LoadVectorx4

* minor rename

* REVERT: Add Debug.Assert(false) to make sure test runs

* Retain gtOtherReg rather than making it an array

* Revert "REVERT: Add Debug.Assert(false) to make sure test runs"

This reverts commit 92fb279.

* fix the test template

* fix the LoadVectorx4 template

* address review comment

* fix one more error in LoadVectorx4Test.template

* feedback by Bruce

* Rename the test case name

* Disable test for mono
  • Loading branch information
kunalspathak committed Oct 5, 2023
1 parent 0e8523e commit b17fa8f
Show file tree
Hide file tree
Showing 16 changed files with 2,224 additions and 554 deletions.
30 changes: 28 additions & 2 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,19 @@ unsigned GenTree::GetMultiRegCount(Compiler* comp) const
return 1;
}

#ifdef TARGET_ARM64
//-----------------------------------------------------------------------------------
// NeedsConsecutiveRegisters: Reports whether the hardware intrinsic held by this
//                            node requires consecutive registers.
//
// Return Value:
//    The answer given by HWIntrinsicInfo::NeedsConsecutiveRegisters() for this
//    node's hardware intrinsic id.
//
// Notes:
//    The node is viewed through AsHWIntrinsic() unconditionally, so this is
//    presumably only meaningful for hardware intrinsic nodes - confirm at call sites.
//
bool GenTree::NeedsConsecutiveRegisters() const
{
    const auto intrinsicId = AsHWIntrinsic()->GetHWIntrinsicId();
    return HWIntrinsicInfo::NeedsConsecutiveRegisters(intrinsicId);
}
#endif

//---------------------------------------------------------------
// gtGetContainedRegMask: Get the reg mask of the node including
// contained nodes (recursive).
Expand Down Expand Up @@ -3438,7 +3451,7 @@ unsigned Compiler::gtHashValue(GenTree* tree)
hash += tree->AsHWIntrinsic()->GetSimdBaseType();
hash += tree->AsHWIntrinsic()->GetSimdSize();
hash += tree->AsHWIntrinsic()->GetAuxiliaryType();
hash += tree->AsHWIntrinsic()->GetOtherReg();
hash += tree->AsHWIntrinsic()->GetRegByIndex(1);
break;
#endif // FEATURE_HW_INTRINSICS

Expand Down Expand Up @@ -25538,11 +25551,24 @@ ClassLayout* GenTreeHWIntrinsic::GetLayout(Compiler* compiler) const
case NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector64:
case NI_AdvSimd_Arm64_LoadPairVector64NonTemporal:
case NI_AdvSimd_LoadVector64x2:
return compiler->typGetBlkLayout(16);

case NI_AdvSimd_Arm64_LoadPairVector128:
case NI_AdvSimd_Arm64_LoadPairVector128NonTemporal:
case NI_AdvSimd_Arm64_LoadVector128x2:
case NI_AdvSimd_LoadVector64x4:
return compiler->typGetBlkLayout(32);

case NI_AdvSimd_LoadVector64x3:
return compiler->typGetBlkLayout(24);

case NI_AdvSimd_Arm64_LoadVector128x3:
return compiler->typGetBlkLayout(48);

case NI_AdvSimd_Arm64_LoadVector128x4:
return compiler->typGetBlkLayout(64);

#endif // TARGET_ARM64

default:
Expand Down Expand Up @@ -25579,7 +25605,7 @@ void GenTreeHWIntrinsic::SetHWIntrinsicId(NamedIntrinsic intrinsicId)
{
return (op1->TypeGet() == op2->TypeGet()) && (op1->GetHWIntrinsicId() == op2->GetHWIntrinsicId()) &&
(op1->GetSimdBaseType() == op2->GetSimdBaseType()) && (op1->GetSimdSize() == op2->GetSimdSize()) &&
(op1->GetAuxiliaryType() == op2->GetAuxiliaryType()) && (op1->GetOtherReg() == op2->GetOtherReg()) &&
(op1->GetAuxiliaryType() == op2->GetAuxiliaryType()) && (op1->GetRegByIndex(1) == op2->GetRegByIndex(1)) &&
OperandsAreEqual(op1, op2);
}

Expand Down
67 changes: 60 additions & 7 deletions src/coreclr/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1836,6 +1836,10 @@ struct GenTree
// Sets the GTF flag equivalent for the regIndex'th register of a multi-reg node.
void SetRegSpillFlagByIdx(GenTreeFlags flags, int regIndex);

#ifdef TARGET_ARM64
bool NeedsConsecutiveRegisters() const;
#endif

// Last-use information for either GenTreeLclVar or GenTreeCopyOrReload nodes.
private:
GenTreeFlags GetLastUseBit(int regIndex) const;
Expand Down Expand Up @@ -3646,7 +3650,7 @@ struct GenTreeLclVar : public GenTreeLclVarCommon
}
else
{
gtOtherReg[regIndex - 1] = regNumberSmall(reg);
gtOtherReg[regIndex - 1] = (regNumberSmall)reg;
}
}

Expand Down Expand Up @@ -6085,15 +6089,66 @@ struct GenTreeJitIntrinsic : public GenTreeMultiOp
NamedIntrinsic gtHWIntrinsicId;

public:
regNumber GetOtherReg() const
//-----------------------------------------------------------
// GetRegNumByIdx: Get the regNumber assigned to the idx'th register position.
//
// Arguments:
//    idx - register position.
//
// Return Value:
//    The regNumber for position 'idx'.
//
// Notes:
//    On TARGET_ARM64, position 0 maps to GetRegNum(); for nodes that need
//    consecutive registers, any other position is computed as GetRegNum() + idx.
//    In every remaining case only position 1 is accepted and gtOtherReg is returned.
//
regNumber GetRegNumByIdx(unsigned idx) const
{
#ifdef TARGET_ARM64
    assert(idx < MAX_MULTIREG_COUNT);

    if (idx == 0)
    {
        return GetRegNum();
    }
    else if (NeedsConsecutiveRegisters())
    {
        // Derived from the first register rather than read from gtOtherReg.
        assert(IsMultiRegNode());
        return static_cast<regNumber>(GetRegNum() + idx);
    }
#endif
    // Anything that reaches here may only ask for otherReg.
    assert(idx == 1);
    return static_cast<regNumber>(gtOtherReg);
}

void SetOtherReg(regNumber reg)
//-----------------------------------------------------------
// SetRegNumByIdx: Record the regNumber for the idx'th register position.
//
// Arguments:
//    reg - reg number
//    idx - register position.
//
// Return Value:
//    None.
//
// Notes:
//    On TARGET_ARM64, position 0 is forwarded to SetRegNum(). For nodes that need
//    consecutive registers nothing is stored for other positions; 'reg' is only
//    asserted to equal GetRegNum() + idx. In every remaining case only position 1
//    is accepted and the value is written to gtOtherReg.
//
void SetRegNumByIdx(regNumber reg, unsigned idx)
{
#ifdef TARGET_ARM64
    assert(idx < MAX_MULTIREG_COUNT);

    if (idx == 0)
    {
        SetRegNum(reg);
        return;
    }
    else if (NeedsConsecutiveRegisters())
    {
        // No store needed: the register is implied by the first one.
        assert(IsMultiRegNode());
        assert(reg == static_cast<regNumber>(GetRegNum() + idx));
        return;
    }
#endif
    // Anything that reaches here may only set otherReg.
    assert(idx == 1);
    gtOtherReg = static_cast<regNumberSmall>(reg);
    // Verify the narrowing store did not lose information.
    assert(gtOtherReg == reg);
}

GenTreeFlags GetRegSpillFlagByIdx(unsigned idx) const
Expand Down Expand Up @@ -9323,9 +9378,7 @@ inline regNumber GenTree::GetRegByIndex(int regIndex) const
#ifdef FEATURE_HW_INTRINSICS
if (OperIs(GT_HWINTRINSIC))
{
assert(regIndex == 1);
// TODO-ARM64-NYI: Support hardware intrinsics operating on multiple contiguous registers.
return AsHWIntrinsic()->GetOtherReg();
return AsHWIntrinsic()->GetRegNumByIdx(regIndex);
}
#endif // FEATURE_HW_INTRINSICS

Expand Down
10 changes: 10 additions & 0 deletions src/coreclr/jit/hwintrinsic.h
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,17 @@ struct HWIntrinsicInfo
case NI_AdvSimd_Arm64_LoadPairVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector128:
case NI_AdvSimd_Arm64_LoadPairVector128NonTemporal:
case NI_AdvSimd_LoadVector64x2:
case NI_AdvSimd_Arm64_LoadVector128x2:
return 2;

case NI_AdvSimd_LoadVector64x3:
case NI_AdvSimd_Arm64_LoadVector128x3:
return 3;

case NI_AdvSimd_LoadVector64x4:
case NI_AdvSimd_Arm64_LoadVector128x4:
return 4;
#endif

#ifdef TARGET_XARCH
Expand Down
8 changes: 8 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1877,6 +1877,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
break;
}

case NI_AdvSimd_LoadVector64x2:
case NI_AdvSimd_LoadVector64x3:
case NI_AdvSimd_LoadVector64x4:
case NI_AdvSimd_Arm64_LoadVector128x2:
case NI_AdvSimd_Arm64_LoadVector128x3:
case NI_AdvSimd_Arm64_LoadVector128x4:
info.compNeedsConsecutiveRegisters = true;
FALLTHROUGH;
case NI_AdvSimd_Arm64_LoadPairScalarVector64:
case NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector128:
Expand Down
5 changes: 3 additions & 2 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,12 +732,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_AdvSimd_Arm64_LoadPairVector128NonTemporal:
case NI_AdvSimd_Arm64_LoadPairVector64:
case NI_AdvSimd_Arm64_LoadPairVector64NonTemporal:
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, node->GetOtherReg(), op1Reg);
GetEmitter()->emitIns_R_R_R(ins, emitSize, targetReg, node->GetRegByIndex(1), op1Reg);
break;

case NI_AdvSimd_Arm64_LoadPairScalarVector64:
case NI_AdvSimd_Arm64_LoadPairScalarVector64NonTemporal:
GetEmitter()->emitIns_R_R_R(ins, emitTypeSize(intrin.baseType), targetReg, node->GetOtherReg(), op1Reg);
GetEmitter()->emitIns_R_R_R(ins, emitTypeSize(intrin.baseType), targetReg, node->GetRegByIndex(1),
op1Reg);
break;

case NI_AdvSimd_StoreSelectedScalar:
Expand Down
6 changes: 6 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,9 @@ HARDWARE_INTRINSIC(AdvSimd, LoadAndReplicateToVector64,
HARDWARE_INTRINSIC(AdvSimd, LoadAndReplicateToVector128, 16, 1, true, {INS_ld1r, INS_ld1r, INS_ld1r, INS_ld1r, INS_ld1r, INS_ld1r, INS_invalid, INS_invalid, INS_ld1r, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
HARDWARE_INTRINSIC(AdvSimd, LoadVector64, 8, 1, true, {INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AdvSimd, LoadVector128, 16, 1, true, {INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1, INS_ld1}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(AdvSimd, LoadVector64x2, 8, 1, true, {INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_invalid, INS_invalid, INS_ld2, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_MultiReg|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(AdvSimd, LoadVector64x3, 8, 1, true, {INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_invalid, INS_invalid, INS_ld3, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_MultiReg|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(AdvSimd, LoadVector64x4, 8, 1, true, {INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_invalid, INS_invalid, INS_ld4, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_MultiReg|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(AdvSimd, Max, -1, 2, true, {INS_smax, INS_umax, INS_smax, INS_umax, INS_smax, INS_umax, INS_invalid, INS_invalid, INS_fmax, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, MaxNumber, -1, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmaxnm, INS_invalid}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd, MaxNumberScalar, 8, 2, true, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmaxnm, INS_fmaxnm}, HW_Category_SIMD, HW_Flag_Commutative|HW_Flag_SIMDScalar)
Expand Down Expand Up @@ -573,6 +576,9 @@ HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector64,
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector64NonTemporal, 8, 1, true, {INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiReg|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector128, 16, 1, true, {INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp, INS_ldp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiReg|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadPairVector128NonTemporal, 16, 1, true, {INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp, INS_ldnp}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiReg|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadVector128x2, 16, 1, true, {INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2, INS_ld2}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_MultiReg|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadVector128x3, 16, 1, true, {INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3, INS_ld3}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_MultiReg|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(AdvSimd_Arm64, LoadVector128x4, 16, 1, true, {INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4, INS_ld4}, HW_Category_MemoryLoad, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport|HW_Flag_MultiReg|HW_Flag_NeedsConsecutiveRegisters)
HARDWARE_INTRINSIC(AdvSimd_Arm64, Max, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmax}, HW_Category_SIMD, HW_Flag_Commutative)
HARDWARE_INTRINSIC(AdvSimd_Arm64, MaxAcross, -1, 1, true, {INS_smaxv, INS_umaxv, INS_smaxv, INS_umaxv, INS_smaxv, INS_umaxv, INS_invalid, INS_invalid, INS_fmaxv, INS_invalid}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(AdvSimd_Arm64, MaxNumber, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_fmaxnm}, HW_Category_SIMD, HW_Flag_Commutative)
Expand Down
4 changes: 1 addition & 3 deletions src/coreclr/jit/lsra.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,9 +155,7 @@ void lsraAssignRegToTree(GenTree* tree, regNumber reg, unsigned regIdx)
#ifdef FEATURE_HW_INTRINSICS
else if (tree->OperIs(GT_HWINTRINSIC))
{
assert(regIdx == 1);
// TODO-ARM64-NYI: Support hardware intrinsics operating on multiple contiguous registers.
tree->AsHWIntrinsic()->SetOtherReg(reg);
tree->AsHWIntrinsic()->SetRegNumByIdx(reg, regIdx);
}
#endif // FEATURE_HW_INTRINSICS
else if (tree->OperIs(GT_LCL_VAR, GT_STORE_LCL_VAR))
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/lsra.h
Original file line number Diff line number Diff line change
Expand Up @@ -2011,6 +2011,7 @@ class LinearScan : public LinearScanInterface
int BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCount);
#ifdef TARGET_ARM64
int BuildConsecutiveRegistersForUse(GenTree* treeNode, GenTree* rmwNode = nullptr);
void BuildConsecutiveRegistersForDef(GenTree* treeNode, int fieldCount);
#endif // TARGET_ARM64
#endif // FEATURE_HW_INTRINSICS

Expand Down
Loading

0 comments on commit b17fa8f

Please sign in to comment.