Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More XARCH PUTARG_STK CQ improvements #67400

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 18 additions & 25 deletions src/coreclr/jit/codegenxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7908,43 +7908,36 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)

#ifdef TARGET_X86

// On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT.
assert(targetType != TYP_LONG);
assert((putArgStk->GetStackByteSize() % TARGET_POINTER_SIZE) == 0);

genAlignStackBeforeCall(putArgStk);

if ((data->OperGet() != GT_FIELD_LIST) && varTypeIsStruct(targetType))
if (data->OperIs(GT_FIELD_LIST))
{
(void)genAdjustStackForPutArgStk(putArgStk);
genPutStructArgStk(putArgStk);
genPutArgStkFieldList(putArgStk);
return;
}

// On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT.
assert(targetType != TYP_LONG);

const unsigned argSize = putArgStk->GetStackByteSize();
assert((argSize % TARGET_POINTER_SIZE) == 0);

if (data->isContainedIntOrIImmed())
if (varTypeIsStruct(targetType))
{
if (data->IsIconHandle())
{
inst_IV_handle(INS_push, data->AsIntCon()->gtIconVal);
}
else
{
inst_IV(INS_push, data->AsIntCon()->gtIconVal);
}
AddStackLevel(argSize);
genAdjustStackForPutArgStk(putArgStk);
genPutStructArgStk(putArgStk);
return;
}
else if (data->OperGet() == GT_FIELD_LIST)

genConsumeRegs(data);

if (data->isUsedFromReg())
{
genPutArgStkFieldList(putArgStk);
genPushReg(targetType, data->GetRegNum());
}
else
{
// We should not see any contained nodes that are not immediates.
assert(data->isUsedFromReg());
genConsumeReg(data);
genPushReg(targetType, data->GetRegNum());
assert(genTypeSize(data) == TARGET_POINTER_SIZE);
inst_TT(INS_push, emitTypeSize(data), data);
AddStackLevel(TARGET_POINTER_SIZE);
}
#else // !TARGET_X86
{
Expand Down
191 changes: 106 additions & 85 deletions src/coreclr/jit/lowerxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,8 @@ void Lowering::ContainBlockStoreAddress(GenTreeBlk* blkNode, unsigned size, GenT
//
void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
{
GenTree* src = putArgStk->gtGetOp1();
GenTree* src = putArgStk->gtGetOp1();
bool srcIsLocal = src->OperIsLocalRead();

if (src->OperIs(GT_FIELD_LIST))
{
Expand Down Expand Up @@ -530,109 +531,129 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
}

#ifdef FEATURE_PUT_STRUCT_ARG_STK
if (src->TypeGet() != TYP_STRUCT)
#endif // FEATURE_PUT_STRUCT_ARG_STK
if (src->TypeIs(TYP_STRUCT))
{
// If the child of GT_PUTARG_STK is a constant, we don't need a register to
// move it to memory (stack location).
//
// On AMD64, we don't want to make 0 contained, because we can generate smaller code
// by zeroing a register and then storing it. E.g.:
// xor rdx, rdx
// mov gword ptr [rsp+28H], rdx
// is 2 bytes smaller than:
// mov gword ptr [rsp+28H], 0
//
// On x86, we push stack arguments; we don't use 'mov'. So:
// push 0
// is 1 byte smaller than:
// xor rdx, rdx
// push rdx
ClassLayout* layout = src->AsObj()->GetLayout();
var_types regType = layout->GetRegisterType();
srcIsLocal |= src->AsObj()->Addr()->OperIsLocalAddr();

if (IsContainableImmed(putArgStk, src)
#if defined(TARGET_AMD64)
&& !src->IsIntegralConst(0)
#endif // TARGET_AMD64
)
if (regType == TYP_UNDEF)
{
MakeSrcContained(putArgStk, src);
}
return;
}

#ifdef FEATURE_PUT_STRUCT_ARG_STK
GenTree* srcAddr = nullptr;

bool haveLocalAddr = false;
if ((src->OperGet() == GT_OBJ) || (src->OperGet() == GT_IND))
{
srcAddr = src->AsOp()->gtOp1;
assert(srcAddr != nullptr);
haveLocalAddr = srcAddr->OperIsLocalAddr();
}
else
{
assert(varTypeIsSIMD(putArgStk));
}

ClassLayout* layout = src->AsObj()->GetLayout();
// In case of a CpBlk we could use a helper call. In case of putarg_stk we
// can't do that since the helper call could kill some already set up outgoing args.
// TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
// The cpyXXXX code is rather complex and this could cause it to be more complex, but
// it might be the right thing to do.

// In case of a CpBlk we could use a helper call. In case of putarg_stk we
// can't do that since the helper call could kill some already set up outgoing args.
// TODO-Amd64-Unix: converge the code for putarg_stk with cpyblk/cpyobj.
// The cpyXXXX code is rather complex and this could cause it to be more complex, but
// it might be the right thing to do.
unsigned size = putArgStk->GetStackByteSize();
unsigned loadSize = layout->GetSize();

unsigned size = putArgStk->GetStackByteSize();
assert(loadSize <= size);

// TODO-X86-CQ: The helper call either is not supported on x86 or required more work
// (I don't know which).
// TODO-X86-CQ: The helper call either is not supported on x86 or required more work
// (I don't know which).

if (!layout->HasGCPtr())
{
if (!layout->HasGCPtr())
{
#ifdef TARGET_X86
if (size < XMM_REGSIZE_BYTES)
{
// Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
// chunks. As such, the correctness of this code depends on the fact that
// morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
// thus preventing any possible out-of-bounds memory reads.
assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
(src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
else
// Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
// chunks. As such, the correctness of this code depends on the fact that
// morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
// thus preventing any possible out-of-bounds memory reads.
assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
(src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
if (size < XMM_REGSIZE_BYTES)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
}
else
#endif // TARGET_X86
if (size <= CPBLK_UNROLL_LIMIT)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
if (size <= CPBLK_UNROLL_LIMIT)
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
}
else
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
}
}
else // There are GC pointers.
{
#ifdef TARGET_X86
// On x86, we must use `push` to store GC references to the stack in order for the emitter to
// properly update the function's GC info. These `putargstk` nodes will generate a sequence of
// `push` instructions.
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
#else // !TARGET_X86
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
#endif // !TARGET_X86
}

// Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
MakeSrcContained(putArgStk, src);
if (src->OperIs(GT_OBJ) && src->AsObj()->Addr()->OperIsLocalAddr())
{
// If the source address is the address of a lclVar, make the source address contained to avoid
// unnecessary copies.
MakeSrcContained(putArgStk, src->AsObj()->Addr());
}
}
else
{
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
// The ABI allows upper bits of small struct args to remain undefined,
// so if possible, widen the load to avoid the sign/zero-extension.
if (varTypeIsSmall(regType) && srcIsLocal)
{
assert(putArgStk->GetStackByteSize() <= genTypeSize(TYP_INT));
regType = TYP_INT;
}

src->SetOper(GT_IND);
src->ChangeType(regType);
LowerIndir(src->AsIndir());
}
}
else // There are GC pointers.

if (src->TypeIs(TYP_STRUCT))
{
#ifdef TARGET_X86
// On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
// the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
#else // !TARGET_X86
putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
#endif // !TARGET_X86
return;
}
#endif // FEATURE_PUT_STRUCT_ARG_STK

assert(!src->TypeIs(TYP_STRUCT));

// If the child of GT_PUTARG_STK is a constant, we don't need a register to
// move it to memory (stack location).
//
// On AMD64, we don't want to make 0 contained, because we can generate smaller code
// by zeroing a register and then storing it. E.g.:
// xor rdx, rdx
// mov gword ptr [rsp+28H], rdx
// is 2 bytes smaller than:
// mov gword ptr [rsp+28H], 0
//
// On x86, we push stack arguments; we don't use 'mov'. So:
// push 0
// is 1 byte smaller than:
// xor rdx, rdx
// push rdx

// Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
MakeSrcContained(putArgStk, src);
if (haveLocalAddr)
if (IsContainableImmed(putArgStk, src)
#if defined(TARGET_AMD64)
&& !src->IsIntegralConst(0)
#endif // TARGET_AMD64
)
{
// If the source address is the address of a lclVar, make the source address contained to avoid unnecessary
// copies.
//
MakeSrcContained(putArgStk, srcAddr);
MakeSrcContained(putArgStk, src);
}
#endif // FEATURE_PUT_STRUCT_ARG_STK
#ifdef TARGET_X86
else if ((genTypeSize(src) == TARGET_POINTER_SIZE) && IsContainableMemoryOp(src) &&
IsSafeToContainMem(putArgStk, src))
{
// Contain for "push [mem]".
MakeSrcContained(putArgStk, src);
}
#endif // TARGET_X86
}

/* Lower GT_CAST(srcType, DstType) nodes.
Expand Down
24 changes: 12 additions & 12 deletions src/coreclr/jit/lsraxarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1561,21 +1561,21 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
GenTree* src = putArgStk->gtOp1;
var_types type = src->TypeGet();

#if defined(FEATURE_SIMD) && defined(TARGET_X86)
// For PutArgStk of a TYP_SIMD12, we need an extra register.
if (putArgStk->isSIMD12())
if (type != TYP_STRUCT)
{
buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
BuildUse(putArgStk->gtOp1);
srcCount = 1;
buildInternalRegisterUses();
return srcCount;
}
#if defined(FEATURE_SIMD) && defined(TARGET_X86)
// For PutArgStk of a TYP_SIMD12, we need an extra register.
if (putArgStk->isSIMD12())
{
buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
BuildUse(src);
srcCount = 1;
buildInternalRegisterUses();
return srcCount;
}
#endif // defined(FEATURE_SIMD) && defined(TARGET_X86)

if (type != TYP_STRUCT)
{
return BuildSimple(putArgStk);
return BuildOperandUses(src);
}

ssize_t size = putArgStk->GetStackByteSize();
Expand Down