Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update the JIT to support rewriting more complex intrinsics as user calls #102702

Merged
merged 6 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 79 additions & 4 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4128,6 +4128,8 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
// first tree to be evaluated, and "lvl2" - the second.
if (multiOp->IsReverseOp())
{
assert(!multiOp->AsHWIntrinsic()->IsUserCall());

level = gtSetEvalOrder(multiOp->Op(2));
lvl2 = gtSetEvalOrder(multiOp->Op(1));
}
Expand All @@ -4140,11 +4142,18 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
// We want the more complex tree to be evaluated first.
if (level < lvl2)
{
bool canSwap = multiOp->IsReverseOp() ? gtCanSwapOrder(multiOp->Op(2), multiOp->Op(1))
: gtCanSwapOrder(multiOp->Op(1), multiOp->Op(2));
bool canSwap = false;

if (!multiOp->AsHWIntrinsic()->IsUserCall())
{
canSwap = multiOp->IsReverseOp() ? gtCanSwapOrder(multiOp->Op(2), multiOp->Op(1))
: gtCanSwapOrder(multiOp->Op(1), multiOp->Op(2));
}

if (canSwap)
{
assert(!multiOp->AsHWIntrinsic()->IsUserCall());

if (multiOp->IsReverseOp())
{
multiOp->ClearReverseOp();
Expand Down Expand Up @@ -6563,7 +6572,7 @@ bool GenTree::OperSupportsReverseOpEvalOrder(Compiler* comp) const
#if defined(FEATURE_SIMD) || defined(FEATURE_HW_INTRINSICS)
if (OperIsMultiOp())
{
return AsMultiOp()->GetOperandCount() == 2;
return (AsMultiOp()->GetOperandCount() == 2) && !AsMultiOp()->IsUserCall();
}
#endif // FEATURE_SIMD || FEATURE_HW_INTRINSICS
return false;
Expand Down Expand Up @@ -9711,6 +9720,11 @@ GenTree* Compiler::gtCloneExpr(GenTree* tree)
tree->AsHWIntrinsic()->GetHWIntrinsicId(),
tree->AsHWIntrinsic()->GetSimdBaseJitType(), tree->AsHWIntrinsic()->GetSimdSize());
copy->AsHWIntrinsic()->SetAuxiliaryJitType(tree->AsHWIntrinsic()->GetAuxiliaryJitType());

if (tree->AsHWIntrinsic()->IsUserCall())
{
copy->AsHWIntrinsic()->SetMethodHandle(this, tree->AsHWIntrinsic()->GetMethodHandle());
}
goto CLONE_MULTIOP_OPERANDS;
#endif
#if defined(FEATURE_SIMD) || defined(FEATURE_HW_INTRINSICS)
Expand Down Expand Up @@ -19570,6 +19584,67 @@ void GenTreeMultiOp::InitializeOperands(GenTree** operands, size_t operandCount)
SetOperandCount(operandCount);
}

//------------------------------------------------------------------------
// GenTreeJitIntrinsic::SetMethodHandle: Sets the method handle for an intrinsic
// so that it can be rewritten back to a user call in a later phase
//
// Arguments:
//    comp         - The compiler instance
//    methodHandle - The method handle representing the fallback handling for the intrinsic
//
// Notes:
//    We need to ensure that the operands are not tracked inline so that we can track the
//    underlying method handle. See the comment in GenTreeJitIntrinsic around why the union
//    of fields exists.
//
void GenTreeJitIntrinsic::SetMethodHandle(Compiler* comp, CORINFO_METHOD_HANDLE methodHandle)
{
    assert(OperIsHWIntrinsic() && !IsUserCall());
    gtFlags |= GTF_HW_USER_CALL;

    size_t operandCount = GetOperandCount();

    // If the operands currently live in the inline array (which shares storage with
    // gtMethodHandle/gtEntryPoint via the union), evacuate them to a freshly allocated
    // out-of-line array before the handle overwrites that storage.
    if ((operandCount != 0) && (operandCount <= ArrLen(gtInlineOperands)))
    {
        GenTree** oldOperands = GetOperandArray();
        GenTree** newOperands = comp->getAllocator(CMK_ASTNode).allocate<GenTree*>(operandCount);

        ResetOperandArray(operandCount, comp, newOperands, operandCount);
        assert(GetOperandArray() == newOperands);

        // Copy the operands before gtMethodHandle is assigned below, since that
        // assignment clobbers the old (inline) storage.
        for (size_t i = 0; i < operandCount; i++)
        {
            newOperands[i] = oldOperands[i];
        }
    }

    gtMethodHandle = methodHandle;

#if defined(FEATURE_READYTORUN)
    // gtEntryPoint is only declared under FEATURE_READYTORUN, so this reset must be
    // guarded the same way as the field's declaration to keep non-R2R builds compiling.
    gtEntryPoint = nullptr;
#endif // FEATURE_READYTORUN
}

#if defined(FEATURE_READYTORUN)
//------------------------------------------------------------------------
// GenTreeJitIntrinsic::SetEntryPoint: Sets the entry point for an intrinsic
// so that it can be rewritten back to a user call in a later phase for R2R
// scenarios
//
// Arguments:
//    comp       - The compiler instance
//    entryPoint - The entry point information required for R2R scenarios
//
// Notes:
//    This requires SetMethodHandle to have been called first to ensure we aren't
//    overwriting any inline operands
//
void GenTreeJitIntrinsic::SetEntryPoint(Compiler* comp, CORINFO_CONST_LOOKUP entryPoint)
{
    // SetMethodHandle must have run first (it sets GTF_HW_USER_CALL and moves the
    // operands out of the union), and the entry point must not already be set.
    assert(IsUserCall());
    assert(gtEntryPoint == nullptr);

    // Heap-allocate a copy so the union only needs to carry a pointer.
    gtEntryPoint = new (comp, CMK_ASTNode) CORINFO_CONST_LOOKUP(entryPoint);
}
#endif // FEATURE_READYTORUN

var_types GenTreeJitIntrinsic::GetAuxiliaryType() const
{
CorInfoType auxiliaryJitType = GetAuxiliaryJitType();
Expand Down Expand Up @@ -27036,7 +27111,7 @@ bool GenTreeHWIntrinsic::OperRequiresCallFlag() const
}
}

return false;
return IsUserCall();
}

//------------------------------------------------------------------------------
Expand Down
52 changes: 51 additions & 1 deletion src/coreclr/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -559,6 +559,7 @@ enum GenTreeFlags : unsigned int

#ifdef FEATURE_HW_INTRINSICS
GTF_HW_EM_OP = 0x10000000, // GT_HWINTRINSIC -- node is used as an operand to an embedded mask
GTF_HW_USER_CALL = 0x20000000, // GT_HWINTRINSIC -- node is implemented via a user call
#endif // FEATURE_HW_INTRINSICS
};

Expand Down Expand Up @@ -6089,6 +6090,15 @@ struct GenTreeMultiOp : public GenTree
}
#endif

// Returns true when this node is a HWINTRINSIC that carries a method handle and
// must be rewritten back into a user call (GTF_HW_USER_CALL is set).
bool IsUserCall() const
{
#if defined(FEATURE_HW_INTRINSICS)
    if (!OperIs(GT_HWINTRINSIC))
    {
        return false;
    }
    return (gtFlags & GTF_HW_USER_CALL) != 0;
#else
    return false;
#endif
}

GenTree*& Op(size_t index)
{
size_t actualIndex = index - 1;
Expand Down Expand Up @@ -6217,7 +6227,29 @@ class IntrinsicNodeBuilder final
struct GenTreeJitIntrinsic : public GenTreeMultiOp
{
protected:
GenTree* gtInlineOperands[2];
union
{
// We don't have enough space to carry both the inline operands
// and the necessary information required to support rewriting
// the intrinsic back into a user call. As such, we union the
// data instead and use the GTF_HW_USER_CALL flag to indicate
// which fields are valid to access. -- Tracking the fields
// independently causes TREE_NODE_SZ_LARGE to increase and for
// GenTreeJitIntrinsic to become the largest node, which is
// undesirable, so this approach helps keep things pay-for-play.

GenTree* gtInlineOperands[2];
Copy link
Member

@EgorBo EgorBo May 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think I follow what gtInlineOperands is (and what the logic inside SetMethodHandle does) — can you explain?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gtInlineOperands is an existing field that allows GenTreeMultiOp to avoid allocating for the common case where there are 2 or fewer operands, instead allowing us to carry the operands within the main allocation.

In order to facilitate rewriting the hwintrinsic back to a call, however, we need to be able to track the CORINFO_METHOD_HANDLE and we don't have enough free space to do that. Adding the field directly ends up making the node larger than TREE_NODE_SZ_LARGE as well, so to avoid pessimizing the rest of the JIT I made this into a union. SetMethodHandle will then force an allocation if gtInlineOperands was being used and set a flag that indicates that gtMethodHandle is set instead.

It's worth noting the actual gtInlineOperands field isn't ever read directly either; its address is just handed down to GenTreeMultiOp as part of construction or a ResetOperandArray call. So this ends up working fairly well and ensures that the allocation is "pay for play" and only used when actually necessary (we have an intrinsic with 1 or 2 operands that requires a constant and needs to be carried through to a later phase).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-- Notably the reason we don't have enough free space is primarily because of padding bytes caused by inheritance. There's a few places where nodes are wasting 4-7 bytes of space to maintain 8-byte alignment and that repeats several times.

We could avoid tricks like the one being employed here if we had a better mechanism for avoiding such wasted padding for derived node kinds. But that's a much more complex and independent work item.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I didn't notice it was a pre-existing field - I though you added it in this PR 🙂


struct
{
CORINFO_METHOD_HANDLE gtMethodHandle;

#if defined(FEATURE_READYTORUN)
// Call target lookup info for method call from a Ready To Run module
CORINFO_CONST_LOOKUP* gtEntryPoint;
#endif // FEATURE_READYTORUN
};
};
regNumberSmall gtOtherReg; // The second register for multi-reg intrinsics.
MultiRegSpillFlags gtSpillFlags; // Spill flags for multi-reg intrinsics.
unsigned char gtAuxiliaryJitType; // For intrinsics than need another type (e.g. Avx2.Gather* or SIMD (by element))
Expand All @@ -6226,6 +6258,24 @@ struct GenTreeJitIntrinsic : public GenTreeMultiOp
NamedIntrinsic gtHWIntrinsicId;

public:
// Returns the fallback method handle for a user-call intrinsic.
// Only valid when GTF_HW_USER_CALL is set: gtMethodHandle shares storage with
// the inline operand array via the union, so reading it otherwise would alias
// operand pointers.
CORINFO_METHOD_HANDLE GetMethodHandle() const
{
    assert(IsUserCall());
    return gtMethodHandle;
}

void SetMethodHandle(Compiler* comp, CORINFO_METHOD_HANDLE methodHandle);

#if defined(FEATURE_READYTORUN)
// Returns (by value) the R2R call-target lookup info for a user-call intrinsic.
// Only valid when GTF_HW_USER_CALL is set; gtEntryPoint shares union storage with
// the inline operand array. Asserts rather than returning a sentinel when unset.
CORINFO_CONST_LOOKUP GetEntryPoint() const
{
    assert(IsUserCall());
    return *gtEntryPoint;
}

void SetEntryPoint(Compiler* comp, CORINFO_CONST_LOOKUP entryPoint);
#endif // FEATURE_READYTORUN

//-----------------------------------------------------------
// GetRegNumByIdx: Get regNumber of i'th position.
//
Expand Down
9 changes: 8 additions & 1 deletion src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1869,7 +1869,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,

if (!indices->IsVectorConst())
{
// TODO-ARM64-CQ: Handling non-constant indices is a bit more complex
assert(sig->numArgs == 2);

op2 = impSIMDPopStack();
op1 = impSIMDPopStack();

retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, simdBaseJitType, simdSize);

retNode->AsHWIntrinsic()->SetMethodHandle(this, method);
break;
}

Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/hwintrinsiclistarm64.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ HARDWARE_INTRINSIC(Vector64, op_UnsignedRightShift,
HARDWARE_INTRINSIC(Vector64, ShiftLeft, 8, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, ShiftRightArithmetic, 8, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, ShiftRightLogical, 8, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, Shuffle, 8, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, Shuffle, 8, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Vector64, Sqrt, 8, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector64, Store, 8, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector64, StoreAligned, 8, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
Expand Down Expand Up @@ -220,7 +220,7 @@ HARDWARE_INTRINSIC(Vector128, op_UnaryPlus,
HARDWARE_INTRINSIC(Vector128, ShiftLeft, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, ShiftRightArithmetic, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, ShiftRightLogical, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, Shuffle, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, Shuffle, 16, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
HARDWARE_INTRINSIC(Vector128, Store, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
Expand Down
Loading
Loading