Skip to content
This repository has been archived by the owner on Jan 23, 2023. It is now read-only.

[RyuJIT/ARM32] Enabling fast tail call feature #14056

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/jit/codegencommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9676,7 +9676,12 @@ void CodeGen::genFnEpilog(BasicBlock* block)
// Call target = REG_FASTTAILCALL_TARGET
// https://github.com/dotnet/coreclr/issues/4827
// Do we need a special encoding for stack walker like rex.w prefix for x64?
CLANG_FORMAT_COMMENT_ANCHOR;
#ifdef _TARGET_ARM_
getEmitter()->emitIns_R(INS_bx, emitTypeSize(TYP_I_IMPL), REG_FASTTAILCALL_TARGET);
#else
getEmitter()->emitIns_R(INS_br, emitTypeSize(TYP_I_IMPL), REG_FASTTAILCALL_TARGET);
#endif
}
#endif // FEATURE_FASTTAILCALL
}
Expand Down
4 changes: 2 additions & 2 deletions src/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ inline bool isRegParamType(var_types type)
#endif // !_TARGET_X86_
}

#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
#if defined(_TARGET_AMD64_) || defined(_TARGET_ARMARCH_)
/*****************************************************************************/
// Returns true if 'type' is a struct that can be enregistered for call args
// or can be returned by value in multiple registers.
Expand Down Expand Up @@ -724,7 +724,7 @@ inline bool Compiler::VarTypeIsMultiByteAndCanEnreg(var_types type,

return result;
}
#endif //_TARGET_AMD64_ || _TARGET_ARM64_
#endif //_TARGET_AMD64_ || _TARGET_ARMARCH_

/*****************************************************************************/

Expand Down
4 changes: 2 additions & 2 deletions src/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6708,7 +6708,7 @@ bool Compiler::impTailCallRetTypeCompatible(var_types callerRetType,
return true;
}

#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
#if defined(_TARGET_AMD64_) || defined(_TARGET_ARMARCH_)
// Jit64 compat:
if (callerRetType == TYP_VOID)
{
Expand Down Expand Up @@ -6738,7 +6738,7 @@ bool Compiler::impTailCallRetTypeCompatible(var_types callerRetType,
{
return (varTypeIsIntegral(calleeRetType) || isCalleeRetTypMBEnreg) && (callerRetTypeSize == calleeRetTypeSize);
}
#endif // _TARGET_AMD64_ || _TARGET_ARM64_
#endif // _TARGET_AMD64_ || _TARGET_ARMARCH_

return false;
}
Expand Down
8 changes: 7 additions & 1 deletion src/jit/lclvars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -948,6 +948,12 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo)
#endif // _TARGET_XXX_

#if FEATURE_FASTTAILCALL
#ifdef _TARGET_ARM_
if ((varDscInfo->stackArgSize / TARGET_POINTER_SIZE) % cAlign != 0)
{
varDscInfo->stackArgSize += TARGET_POINTER_SIZE;
}
#endif
varDscInfo->stackArgSize += (unsigned)roundUp(argSize, TARGET_POINTER_SIZE);
#endif // FEATURE_FASTTAILCALL
}
Expand Down Expand Up @@ -3571,7 +3577,7 @@ var_types LclVarDsc::lvaArgType()
}
}
#endif // !FEATURE_UNIX_AMD64_STRUCT_PASSING
#elif defined(_TARGET_ARM64_)
#elif defined(_TARGET_ARMARCH_)
if (type == TYP_STRUCT)
{
NYI("lvaArgType");
Expand Down
1 change: 0 additions & 1 deletion src/jit/lower.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,7 +1027,6 @@ GenTreePtr Lowering::NewPutArg(GenTreeCall* call, GenTreePtr arg, fgArgTabEntryP
if (info->isSplit)
{
assert(arg->OperGet() == GT_OBJ || arg->OperGet() == GT_FIELD_LIST);
// TODO: Need to check correctness for FastTailCall
if (call->IsFastTailCall())
{
NYI_ARM("lower: struct argument by fast tail call");
Expand Down
70 changes: 67 additions & 3 deletions src/jit/morph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7419,6 +7419,19 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
++nCalleeArgs;
assert(args->OperIsList());
GenTreePtr argx = args->gtOp.gtOp1;
#ifdef _TARGET_ARM_
unsigned argAlign = 1;
codeGen->InferOpSizeAlign(argx, &argAlign);

argAlign = roundUp(argAlign, TARGET_POINTER_SIZE) / TARGET_POINTER_SIZE;

// We don't care float register because we will not use fast tailcall
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like it is worth tracking fastTailCall support for callee functions with floating point. Is there an issue to track adding support for this later?

In addition what is the rational for not implementing it now?

// for callee method using float register
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Do not track floating point registers for arm32. It is NYI.

if (calleeArgRegCount % argAlign != 0)
{
calleeArgRegCount++;
}
#endif

if (varTypeIsStruct(argx))
{
Expand All @@ -7441,8 +7454,9 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
}
if (objClass != nullptr)
{
#if defined(_TARGET_AMD64_) || defined(_TARGET_ARM64_)
#if defined(_TARGET_AMD64_) || defined(_TARGET_ARMARCH_)

#ifndef _TARGET_ARM_
// hasMultiByteStackArgs will determine if the struct can be passed
// in registers. If it cannot we will break the loop and not
// fastTailCall. This is an implementation limitation
Expand All @@ -7451,6 +7465,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
unsigned typeSize = 0;
hasMultiByteStackArgs = hasMultiByteStackArgs ||
!VarTypeIsMultiByteAndCanEnreg(argx->TypeGet(), objClass, &typeSize, false);
#endif // !_TARGET_ARM_

#if defined(FEATURE_UNIX_AMD64_STRUCT_PASSING)
SYSTEMV_AMD64_CORINFO_STRUCT_REG_PASSING_DESCRIPTOR structDesc;
Expand Down Expand Up @@ -7517,6 +7532,45 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
calleeArgRegCount += size;
}

#elif defined(_TARGET_ARM_) // ARM
var_types hfaType = GetHfaType(argx);
bool isHfaArg = varTypeIsFloating(hfaType);
size_t size = 1;

if (isHfaArg)
{
reportFastTailCallDecision("Callee uses float register arguments.", 0, 0);
return false;
}
else
{
size = (unsigned)(roundUp(info.compCompHnd->getClassSize(objClass), TARGET_POINTER_SIZE)) /
TARGET_POINTER_SIZE;
// We cannot handle split struct yet
// TODO: Fix to calculate exact count
if ((calleeArgRegCount < MAX_REG_ARG) && (size + calleeArgRegCount > MAX_REG_ARG))
{
reportFastTailCallDecision("Callee uses split struct argument.", 0, 0);
return false;
}

if (size > 1)
{
// hasTwoSlotSizedStruct will determine if the struct value can be passed multiple slot.
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: two spaces in between passed and multiple

// We set hasTwoSlotSizedStruct if size > 1 because all struct are passed by value on ARM32.
hasTwoSlotSizedStruct = true;
if (calleeArgRegCount >= MAX_REG_ARG)
{
// hasMultiByteStackArgs will determine if the struct can be passed
// in registers. If it cannot we will break the loop and not
// fastTailCall. This is an implementation limitation
// where the callee only is checked for non enregisterable structs.
// It is tracked with https://github.com/dotnet/coreclr/issues/12644.
hasMultiByteStackArgs = true;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be using hasMultByteStackArgs in a undesired way as this struct can be passed in registers, there are just more callee arguments and we have to spill. Plus, I am not a big fan of the original hasMultiByteStackArgs code path I would like to avoid building on it.

I think the preferred way of dealing with the case of calleeArgRegCount >= MAX_REG_ARG && hasTwoSlotSizedStruct is here: https://github.com/dotnet/coreclr/blob/master/src/jit/morph.cpp#L7524.

}
}
calleeArgRegCount += size;
}
#elif defined(WINDOWS_AMD64_ABI)

++calleeArgRegCount;
Expand All @@ -7526,7 +7580,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
#else
assert(!"Target platform ABI rules regarding passing struct type args in registers");
unreached();
#endif //_TARGET_AMD64_ || _TARGET_ARM64_
#endif //_TARGET_AMD64_ || _TARGET_ARMARCH_
}
else
{
Expand All @@ -7535,7 +7589,17 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
}
else
{
#ifdef _TARGET_ARM_
if (varTypeIsFloating(argx))
{
return false;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add logging to this return using reportFastTailCall decision.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also adding some sort of NYI or comment here explaining why this returns false would be nice.

}
unsigned size = genTypeStSz(argx->gtType);

varTypeIsFloating(argx) ? calleeFloatArgRegCount += size : calleeArgRegCount += size;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems better to just have this as:

calleeArgRegCount += size

#else // !_TARGET_ARM_
varTypeIsFloating(argx) ? ++calleeFloatArgRegCount : ++calleeArgRegCount;
#endif // !_TARGET_ARM_
}

// We can break early on multiByte cases.
Expand Down Expand Up @@ -7587,7 +7651,7 @@ bool Compiler::fgCanFastTailCall(GenTreeCall* callee)
return false;
}

#elif (defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI)) || defined(_TARGET_ARM64_)
#elif (defined(_TARGET_AMD64_) && defined(UNIX_AMD64_ABI)) || defined(_TARGET_ARMARCH_)

// For *nix Amd64 and Arm64 check to see if all arguments for the callee
// and caller are passing in registers. If not, ensure that the outgoing argument stack size
Expand Down
4 changes: 4 additions & 0 deletions src/jit/target.h
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,11 @@ typedef unsigned short regPairNoSmall; // arm: need 12 bits
#define FEATURE_FIXED_OUT_ARGS 1 // Preallocate the outgoing arg area in the prolog
#define FEATURE_STRUCTPROMOTE 1 // JIT Optimization to promote fields of structs into registers
#define FEATURE_MULTIREG_STRUCT_PROMOTE 0 // True when we want to promote fields of a multireg struct into registers
#ifdef LEGACY_BACKEND
#define FEATURE_FASTTAILCALL 0 // Tail calls made as epilog+jmp
#else
#define FEATURE_FASTTAILCALL 1
#endif
#define FEATURE_TAILCALL_OPT 0 // opportunistic Tail calls (i.e. without ".tail" prefix) made as fast tail calls.
#define FEATURE_SET_FLAGS 1 // Set to true to force the JIT to mark the trees with GTF_SET_FLAGS when the flags need to be set
#define FEATURE_MULTIREG_ARGS_OR_RET 1 // Support for passing and/or returning single values in more than one register (including HFA support)
Expand Down