diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index a2e907db09d12..9bf02f45f8cd1 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -242,6 +242,7 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1, // stp REG, REG + 1, [SP, #offset] // 64-bit STP offset range: -512 to 504, multiple of 8. assert(spOffset <= 504); + assert((spOffset % 8) == 0); GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, reg1, reg2, REG_SPBASE, spOffset); #if defined(TARGET_UNIX) diff --git a/src/coreclr/src/jit/codegenarmarch.cpp b/src/coreclr/src/jit/codegenarmarch.cpp index 6e987ec52f139..d51353f48aa30 100644 --- a/src/coreclr/src/jit/codegenarmarch.cpp +++ b/src/coreclr/src/jit/codegenarmarch.cpp @@ -665,9 +665,13 @@ void CodeGen::genIntrinsic(GenTree* treeNode) void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode) { assert(treeNode->OperIs(GT_PUTARG_STK)); - GenTree* source = treeNode->gtOp1; + GenTree* source = treeNode->gtOp1; +#if !defined(OSX_ARM64_ABI) var_types targetType = genActualType(source->TypeGet()); - emitter* emit = GetEmitter(); +#else + var_types targetType = source->TypeGet(); +#endif + emitter* emit = GetEmitter(); // This is the varNum for our store operations, // typically this is the varNum for the Outgoing arg space @@ -678,12 +682,12 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode) // Get argument offset to use with 'varNumOut' // Here we cross check that argument offset hasn't changed from lowering to codegen since // we are storing arg slot number in GT_PUTARG_STK node in lowering phase. - unsigned argOffsetOut = treeNode->gtSlotNum * TARGET_POINTER_SIZE; + unsigned argOffsetOut = treeNode->getArgOffset(); #ifdef DEBUG fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(treeNode->gtCall, treeNode); - assert(curArgTabEntry); - assert(argOffsetOut == (curArgTabEntry->slotNum * TARGET_POINTER_SIZE)); + assert(curArgTabEntry != nullptr); + DEBUG_ARG_SLOTS_ASSERT(argOffsetOut == (curArgTabEntry->slotNum * TARGET_POINTER_SIZE)); #endif // DEBUG // Whether to setup stk arg in incoming or out-going arg area? @@ -730,6 +734,21 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* treeNode) return; } +#if defined(OSX_ARM64_ABI) + switch (treeNode->GetStackByteSize()) + { + case 1: + targetType = TYP_BYTE; + break; + case 2: + targetType = TYP_SHORT; + break; + default: + assert(treeNode->GetStackByteSize() >= 4); + break; + } +#endif + instruction storeIns = ins_Store(targetType); emitAttr storeAttr = emitTypeSize(targetType); @@ -1161,7 +1180,7 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode) emitter* emit = GetEmitter(); unsigned varNumOut = compiler->lvaOutgoingArgSpaceVar; unsigned argOffsetMax = compiler->lvaOutgoingArgSpaceSize; - unsigned argOffsetOut = treeNode->gtSlotNum * TARGET_POINTER_SIZE; + unsigned argOffsetOut = treeNode->getArgOffset(); if (source->OperGet() == GT_FIELD_LIST) { @@ -1292,13 +1311,12 @@ void CodeGen::genPutArgSplit(GenTreePutArgSplit* treeNode) assert(!compiler->IsHfa(source->AsObj()->GetLayout()->GetClassHandle())); } - int structSize = treeNode->getArgSize(); - ClassLayout* layout = source->AsObj()->GetLayout(); + ClassLayout* layout = source->AsObj()->GetLayout(); // Put on stack first unsigned nextIndex = treeNode->gtNumRegs; unsigned structOffset = nextIndex * TARGET_POINTER_SIZE; - int remainingSize = structSize - structOffset; + int remainingSize = treeNode->GetStackByteSize(); // remainingSize is always multiple of TARGET_POINTER_SIZE assert(remainingSize % TARGET_POINTER_SIZE == 0); diff --git a/src/coreclr/src/jit/codegencommon.cpp b/src/coreclr/src/jit/codegencommon.cpp index ec19e5c005d76..fd576344a38fa 100644 --- a/src/coreclr/src/jit/codegencommon.cpp +++ b/src/coreclr/src/jit/codegencommon.cpp @@ -2355,7 +2355,7 @@ void CodeGen::genEmitMachineCode() } #endif -#if EMIT_TRACK_STACK_DEPTH && defined(DEBUG) && !defined(OSX_ARM64_ABI) +#if EMIT_TRACK_STACK_DEPTH && defined(DEBUG_ARG_SLOTS) // Check our max stack level. Needed for fgAddCodeRef(). // We need to relax the assert as our estimation won't include code-gen // stack changes (which we know don't affect fgAddCodeRef()). diff --git a/src/coreclr/src/jit/codegenlinear.cpp b/src/coreclr/src/jit/codegenlinear.cpp index f3f52e7fb77b8..bf8d1ce087adf 100644 --- a/src/coreclr/src/jit/codegenlinear.cpp +++ b/src/coreclr/src/jit/codegenlinear.cpp @@ -1704,8 +1704,6 @@ void CodeGen::genConsumePutStructArgStk(GenTreePutArgStk* putArgNode, assert((src->gtOper == GT_OBJ) || ((src->gtOper == GT_IND && varTypeIsSIMD(src)))); GenTree* srcAddr = src->gtGetOp1(); - unsigned int size = putArgNode->getArgSize(); - assert(dstReg != REG_NA); assert(srcReg != REG_NA); @@ -1757,6 +1755,7 @@ void CodeGen::genConsumePutStructArgStk(GenTreePutArgStk* putArgNode, if (sizeReg != REG_NA) { + unsigned size = putArgNode->GetStackByteSize(); inst_RV_IV(INS_mov, sizeReg, size, EA_PTRSIZE); } } diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp index 259f331470237..e407cf40dbab8 100644 --- a/src/coreclr/src/jit/codegenxarch.cpp +++ b/src/coreclr/src/jit/codegenxarch.cpp @@ -3156,7 +3156,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode) // in genPutStructArgStk(). noway_assert(src->TypeGet() == TYP_STRUCT); - unsigned size = putArgNode->getArgSize(); + unsigned size = putArgNode->GetStackByteSize(); assert(size <= CPBLK_UNROLL_LIMIT); emitter* emit = GetEmitter(); @@ -5049,7 +5049,7 @@ void CodeGen::genCallInstruction(GenTreeCall* call) if (arg->OperIs(GT_PUTARG_STK) && ((arg->gtFlags & GTF_LATE_ARG) == 0)) { GenTree* source = arg->AsPutArgStk()->gtGetOp1(); - unsigned size = arg->AsPutArgStk()->getArgSize(); + unsigned size = arg->AsPutArgStk()->GetStackByteSize(); stackArgBytes += size; #ifdef DEBUG fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(call, arg); @@ -7292,7 +7292,7 @@ void CodeGen::genRemoveAlignmentAfterCall(GenTreeCall* call, unsigned bias) // bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk) { - const unsigned argSize = putArgStk->getArgSize(); + const unsigned argSize = putArgStk->GetStackByteSize(); GenTree* source = putArgStk->gtGetOp1(); #ifdef FEATURE_SIMD @@ -7389,7 +7389,7 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk) // If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them // in reverse order, so we start with the current field offset at the size of the struct arg (which must be // a multiple of the target pointer size). - unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->getArgSize(); + unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->GetStackByteSize(); unsigned prevFieldOffset = currentOffset; regNumber intTmpReg = REG_NA; regNumber simdTmpReg = REG_NA; @@ -7601,7 +7601,7 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk) // On a 32-bit target, all of the long arguments are handled with GT_FIELD_LISTs of TYP_INT. assert(targetType != TYP_LONG); - const unsigned argSize = putArgStk->getArgSize(); + const unsigned argSize = putArgStk->GetStackByteSize(); assert((argSize % TARGET_POINTER_SIZE) == 0); if (data->isContainedIntOrIImmed()) @@ -7653,12 +7653,12 @@ void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk) // Get argument offset on stack. // Here we cross check that argument offset hasn't changed from lowering to codegen since // we are storing arg slot number in GT_PUTARG_STK node in lowering phase. - int argOffset = putArgStk->getArgOffset(); + unsigned argOffset = putArgStk->getArgOffset(); #ifdef DEBUG fgArgTabEntry* curArgTabEntry = compiler->gtArgEntryByNode(putArgStk->gtCall, putArgStk); - assert(curArgTabEntry); - assert(argOffset == (int)curArgTabEntry->slotNum * TARGET_POINTER_SIZE); + assert(curArgTabEntry != nullptr); + assert(argOffset == curArgTabEntry->slotNum * TARGET_POINTER_SIZE); #endif if (data->isContainedIntOrIImmed()) @@ -7899,7 +7899,10 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk) assert(m_pushStkArg); GenTree* srcAddr = source->gtGetOp1(); - const unsigned numSlots = putArgStk->gtNumSlots; + const unsigned byteSize = putArgStk->GetStackByteSize(); + assert(byteSize % TARGET_POINTER_SIZE == 0); + const unsigned numSlots = byteSize / TARGET_POINTER_SIZE; + assert(putArgStk->gtNumSlots == numSlots); regNumber srcRegNum = srcAddr->GetRegNum(); const bool srcAddrInReg = srcRegNum != REG_NA; @@ -7920,15 +7923,15 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk) for (int i = numSlots - 1; i >= 0; --i) { - emitAttr slotAttr = emitTypeSize(layout->GetGCPtrType(i)); - const unsigned offset = i * TARGET_POINTER_SIZE; + emitAttr slotAttr = emitTypeSize(layout->GetGCPtrType(i)); + const unsigned byteOffset = i * TARGET_POINTER_SIZE; if (srcAddrInReg) { - GetEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcRegNum, offset); + GetEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcRegNum, byteOffset); } else { - GetEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + offset); + GetEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + byteOffset); } AddStackLevel(TARGET_POINTER_SIZE); } @@ -7945,7 +7948,10 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk) unsigned numGCSlotsCopied = 0; #endif // DEBUG - const unsigned numSlots = putArgStk->gtNumSlots; + const unsigned byteSize = putArgStk->GetStackByteSize(); + assert(byteSize % TARGET_POINTER_SIZE == 0); + const unsigned numSlots = byteSize / TARGET_POINTER_SIZE; + assert(putArgStk->gtNumSlots == numSlots); for (unsigned i = 0; i < numSlots;) { if (!layout->IsGCPtr(i)) diff --git a/src/coreclr/src/jit/compiler.h b/src/coreclr/src/jit/compiler.h index 0770b2821a4d2..ab6f7602c3ffb 100644 --- a/src/coreclr/src/jit/compiler.h +++ b/src/coreclr/src/jit/compiler.h @@ -1630,7 +1630,9 @@ struct fgArgTabEntry return 0; } - // Get the number of bytes that this argument is occupying on the stack. + // Get the number of bytes that this argument is occupying on the stack, + // including padding up to the target pointer size for platforms + // where a stack argument can't take less. unsigned GetStackByteSize() const { if (!IsSplit() && numRegs > 0) @@ -1642,7 +1644,10 @@ struct fgArgTabEntry assert(GetByteSize() > TARGET_POINTER_SIZE * numRegs); unsigned stackByteSize = GetByteSize() - TARGET_POINTER_SIZE * numRegs; - return GetByteSize() - TARGET_POINTER_SIZE * numRegs; +#if !defined(OSX_ARM64_ABI) + stackByteSize = roundUp(stackByteSize, TARGET_POINTER_SIZE); +#endif + return stackByteSize; } var_types GetHfaType() const @@ -1800,7 +1805,7 @@ struct fgArgTabEntry return size; } -#endif // DEBUG && !OSX_ARM64_ABI +#endif // DEBUG_ARG_SLOTS private: unsigned m_byteOffset; diff --git a/src/coreclr/src/jit/flowgraph.cpp b/src/coreclr/src/jit/flowgraph.cpp index 8ee9e69ac5c8d..7e2f3b7f8b93a 100644 --- a/src/coreclr/src/jit/flowgraph.cpp +++ b/src/coreclr/src/jit/flowgraph.cpp @@ -9807,6 +9807,8 @@ void Compiler::fgSimpleLowering() JITDUMP("Bumping outgoingArgSpaceSize to %u for localloc", outgoingArgSpaceSize); } + assert((outgoingArgSpaceSize % TARGET_POINTER_SIZE) == 0); + // Publish the final value and mark it as read only so any update // attempt later will cause an assert. lvaOutgoingArgSpaceSize = outgoingArgSpaceSize; diff --git a/src/coreclr/src/jit/gentree.cpp b/src/coreclr/src/jit/gentree.cpp index e2ae7339c6d50..e9c8066332968 100644 --- a/src/coreclr/src/jit/gentree.cpp +++ b/src/coreclr/src/jit/gentree.cpp @@ -1206,7 +1206,7 @@ bool GenTreeCall::Equals(GenTreeCall* c1, GenTreeCall* c2) } #if !defined(FEATURE_PUT_STRUCT_ARG_STK) -unsigned GenTreePutArgStk::getArgSize() +unsigned GenTreePutArgStk::GetStackByteSize() const { return genTypeSize(genActualType(gtOp1->gtType)); } @@ -11523,10 +11523,16 @@ void Compiler::gtDispTree(GenTree* tree, #if FEATURE_PUT_STRUCT_ARG_STK else if (tree->OperGet() == GT_PUTARG_STK) { - printf(" (%d slots)", tree->AsPutArgStk()->gtNumSlots); - if (tree->AsPutArgStk()->gtPutArgStkKind != GenTreePutArgStk::Kind::Invalid) + const GenTreePutArgStk* putArg = tree->AsPutArgStk(); +#if !defined(DEBUG_ARG_SLOTS) + printf(" (%d stackByteSize), (%d byteOffset)", putArg->GetStackByteSize(), putArg->getArgOffset()); +#else + printf(" (%d slots), (%d stackByteSize), (%d slot), (%d byteOffset)", putArg->gtNumSlots, + putArg->GetStackByteSize(), putArg->gtSlotNum, putArg->getArgOffset()); +#endif + if (putArg->gtPutArgStkKind != GenTreePutArgStk::Kind::Invalid) { - switch (tree->AsPutArgStk()->gtPutArgStkKind) + switch (putArg->gtPutArgStkKind) { case GenTreePutArgStk::Kind::RepInstr: printf(" (RepInstr)"); @@ -11545,6 +11551,18 @@ void Compiler::gtDispTree(GenTree* tree, } } } +#if FEATURE_ARG_SPLIT + else if (tree->OperGet() == GT_PUTARG_SPLIT) + { + const GenTreePutArgSplit* putArg = tree->AsPutArgSplit(); +#if !defined(DEBUG_ARG_SLOTS) + printf(" (%d stackByteSize), (%d numRegs)", putArg->GetStackByteSize(), putArg->gtNumRegs); +#else + printf(" (%d slots), (%d stackByteSize), (%d numRegs)", putArg->gtNumSlots, putArg->GetStackByteSize(), + putArg->gtNumRegs); +#endif + } +#endif // FEATURE_ARG_SPLIT #endif // FEATURE_PUT_STRUCT_ARG_STK if (tree->gtOper == GT_INTRINSIC) @@ -12007,9 +12025,7 @@ void Compiler::gtGetArgMsg(GenTreeCall* call, GenTree* arg, unsigned argNum, cha } #endif // TARGET_ARM #if FEATURE_FIXED_OUT_ARGS - sprintf_s(bufp, bufLength, "arg%d out+%02x%c", argNum, curArgTabEntry->slotNum * TARGET_POINTER_SIZE, 0); - #else sprintf_s(bufp, bufLength, "arg%d on STK%c", argNum, 0); #endif @@ -12052,8 +12068,7 @@ void Compiler::gtGetLateArgMsg(GenTreeCall* call, GenTree* argx, int lateArgInde #else if (argReg == REG_STK) { - sprintf_s(bufp, bufLength, "arg%d in out+%02x%c", curArgTabEntry->argNum, - curArgTabEntry->slotNum * TARGET_POINTER_SIZE, 0); + sprintf_s(bufp, bufLength, "arg%d in out+%02x%c", curArgTabEntry->argNum, curArgTabEntry->GetByteOffset(), 0); } else #endif diff --git a/src/coreclr/src/jit/gentree.h b/src/coreclr/src/jit/gentree.h index 687a27134af18..6e74955d61473 100644 --- a/src/coreclr/src/jit/gentree.h +++ b/src/coreclr/src/jit/gentree.h @@ -5991,48 +5991,93 @@ struct GenTreePhiArg : public GenTreeLclVarCommon struct GenTreePutArgStk : public GenTreeUnOp { +private: + unsigned m_byteOffset; +#ifdef FEATURE_PUT_STRUCT_ARG_STK + unsigned m_byteSize; // The number of bytes that this argument is occupying on the stack with padding. +#endif + +public: +#if defined(DEBUG_ARG_SLOTS) unsigned gtSlotNum; // Slot number of the argument to be passed on stack +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + unsigned gtNumSlots; // Number of slots for the argument to be passed on stack +#endif +#endif + #if defined(UNIX_X86_ABI) unsigned gtPadAlign; // Number of padding slots for stack alignment #endif +#if defined(DEBUG) || defined(UNIX_X86_ABI) + GenTreeCall* gtCall; // the call node to which this argument belongs +#endif - // Don't let clang-format mess with the GenTreePutArgStk constructor. - // clang-format off +#if FEATURE_FASTTAILCALL + + bool gtPutInIncomingArgArea; // Whether this arg needs to be placed in incoming arg area. + // By default this is false and will be placed in out-going arg area. + // Fast tail calls set this to true. + // In future if we need to add more such bool fields consider bit fields. +#endif - GenTreePutArgStk(genTreeOps oper, - var_types type, +#ifdef FEATURE_PUT_STRUCT_ARG_STK + // Instruction selection: during codegen time, what code sequence we will be using + // to encode this operation. + // TODO-Throughput: The following information should be obtained from the child + // block node. + + enum class Kind : __int8{ + Invalid, RepInstr, Unroll, Push, PushAllSlots, + }; + Kind gtPutArgStkKind; +#endif + + GenTreePutArgStk(genTreeOps oper, + var_types type, GenTree* op1, - unsigned slotNum - PUT_STRUCT_ARG_STK_ONLY_ARG(unsigned numSlots), - bool putInIncomingArgArea = false, - GenTreeCall* callNode = nullptr) + unsigned stackByteOffset, +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + unsigned stackByteSize, +#endif +#if defined(DEBUG_ARG_SLOTS) + unsigned slotNum, +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + unsigned numSlots, +#endif +#endif + GenTreeCall* callNode, + bool putInIncomingArgArea) : GenTreeUnOp(oper, type, op1 DEBUGARG(/*largeNode*/ false)) + , m_byteOffset(stackByteOffset) +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + , m_byteSize(stackByteSize) +#endif +#if defined(DEBUG_ARG_SLOTS) , gtSlotNum(slotNum) +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + , gtNumSlots(numSlots) +#endif +#endif #if defined(UNIX_X86_ABI) , gtPadAlign(0) #endif +#if defined(DEBUG) || defined(UNIX_X86_ABI) + , gtCall(callNode) +#endif #if FEATURE_FASTTAILCALL , gtPutInIncomingArgArea(putInIncomingArgArea) #endif // FEATURE_FASTTAILCALL -#ifdef FEATURE_PUT_STRUCT_ARG_STK +#if defined(FEATURE_PUT_STRUCT_ARG_STK) , gtPutArgStkKind(Kind::Invalid) - , gtNumSlots(numSlots) -#endif // FEATURE_PUT_STRUCT_ARG_STK -#if defined(DEBUG) || defined(UNIX_X86_ABI) - , gtCall(callNode) #endif { + DEBUG_ARG_SLOTS_ASSERT(m_byteOffset == slotNum * TARGET_POINTER_SIZE); +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + DEBUG_ARG_SLOTS_ASSERT(m_byteSize == gtNumSlots * TARGET_POINTER_SIZE); +#endif } -// clang-format on - #if FEATURE_FASTTAILCALL - - bool gtPutInIncomingArgArea; // Whether this arg needs to be placed in incoming arg area. - // By default this is false and will be placed in out-going arg area. - // Fast tail calls set this to true. - // In future if we need to add more such bool fields consider bit fields. - bool putInIncomingArgArea() const { return gtPutInIncomingArgArea; @@ -6049,7 +6094,9 @@ struct GenTreePutArgStk : public GenTreeUnOp unsigned getArgOffset() const { - return gtSlotNum * TARGET_POINTER_SIZE; + DEBUG_ARG_SLOTS_ASSERT(m_byteOffset / TARGET_POINTER_SIZE == gtSlotNum); + DEBUG_ARG_SLOTS_ASSERT(m_byteOffset % TARGET_POINTER_SIZE == 0); + return m_byteOffset; } #if defined(UNIX_X86_ABI) @@ -6065,43 +6112,26 @@ struct GenTreePutArgStk : public GenTreeUnOp #endif #ifdef FEATURE_PUT_STRUCT_ARG_STK - unsigned getArgSize() const + unsigned GetStackByteSize() const { - return gtNumSlots * TARGET_POINTER_SIZE; + return m_byteSize; } // Return true if this is a PutArgStk of a SIMD12 struct. // This is needed because such values are re-typed to SIMD16, and the type of PutArgStk is VOID. unsigned isSIMD12() const { - return (varTypeIsSIMD(gtOp1) && (gtNumSlots == 3)); + return (varTypeIsSIMD(gtOp1) && (GetStackByteSize() == 12)); } - // Instruction selection: during codegen time, what code sequence we will be using - // to encode this operation. - // TODO-Throughput: The following information should be obtained from the child - // block node. - - enum class Kind : __int8{ - Invalid, RepInstr, Unroll, Push, PushAllSlots, - }; - - Kind gtPutArgStkKind; bool isPushKind() const { return (gtPutArgStkKind == Kind::Push) || (gtPutArgStkKind == Kind::PushAllSlots); } - - unsigned gtNumSlots; // Number of slots for the argument to be passed on stack - #else // !FEATURE_PUT_STRUCT_ARG_STK - unsigned getArgSize(); + unsigned GetStackByteSize() const; #endif // !FEATURE_PUT_STRUCT_ARG_STK -#if defined(DEBUG) || defined(UNIX_X86_ABI) - GenTreeCall* gtCall; // the call node to which this argument belongs -#endif - #if DEBUGGABLE_GENTREE GenTreePutArgStk() : GenTreeUnOp() { @@ -6116,16 +6146,34 @@ struct GenTreePutArgSplit : public GenTreePutArgStk unsigned gtNumRegs; GenTreePutArgSplit(GenTree* op1, - unsigned slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(unsigned numSlots), + unsigned stackByteOffset, +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + unsigned stackByteSize, +#endif +#if defined(DEBUG_ARG_SLOTS) + unsigned slotNum, +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + unsigned numSlots, +#endif +#endif unsigned numRegs, - bool putIncomingArgArea = false, - GenTreeCall* callNode = nullptr) + GenTreeCall* callNode, + bool putIncomingArgArea) : GenTreePutArgStk(GT_PUTARG_SPLIT, TYP_STRUCT, op1, - slotNum PUT_STRUCT_ARG_STK_ONLY_ARG(numSlots), - putIncomingArgArea, - callNode) + stackByteOffset, +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + stackByteSize, +#endif +#if defined(DEBUG_ARG_SLOTS) + slotNum, +#if defined(FEATURE_PUT_STRUCT_ARG_STK) + numSlots, +#endif +#endif + callNode, + putIncomingArgArea) , gtNumRegs(numRegs) { ClearOtherRegs(); @@ -6246,13 +6294,6 @@ struct GenTreePutArgSplit : public GenTreePutArgStk gtSpillFlags = 0; } -#ifdef FEATURE_PUT_STRUCT_ARG_STK - unsigned getArgSize() const - { - return (gtNumSlots + gtNumRegs) * TARGET_POINTER_SIZE; - } -#endif // FEATURE_PUT_STRUCT_ARG_STK - #if DEBUGGABLE_GENTREE GenTreePutArgSplit() : GenTreePutArgStk() { diff --git a/src/coreclr/src/jit/lclvars.cpp b/src/coreclr/src/jit/lclvars.cpp index a97e81da21617..548dc8529e39e 100644 --- a/src/coreclr/src/jit/lclvars.cpp +++ b/src/coreclr/src/jit/lclvars.cpp @@ -605,11 +605,12 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo) // ARM softfp calling convention should affect only the floating point arguments. // Otherwise there appear too many surplus pre-spills and other memory operations // with the associated locations . - bool isSoftFPPreSpill = opts.compUseSoftFP && varTypeIsFloating(varDsc->TypeGet()); - unsigned argSize = eeGetArgSize(argLst, &info.compMethodInfo->args); - unsigned cSlots = argSize / TARGET_POINTER_SIZE; // the total number of slots of this argument - bool isHfaArg = false; - var_types hfaType = TYP_UNDEF; + bool isSoftFPPreSpill = opts.compUseSoftFP && varTypeIsFloating(varDsc->TypeGet()); + unsigned argSize = eeGetArgSize(argLst, &info.compMethodInfo->args); + unsigned cSlots = + (argSize + TARGET_POINTER_SIZE - 1) / TARGET_POINTER_SIZE; // the total number of slots of this argument + bool isHfaArg = false; + var_types hfaType = TYP_UNDEF; #if defined(TARGET_ARM64) && defined(TARGET_UNIX) // Native varargs on arm64 unix use the regular calling convention. @@ -1015,7 +1016,11 @@ void Compiler::lvaInitUserArgs(InitVarDscInfo* varDscInfo) #if FEATURE_FASTTAILCALL varDsc->SetStackOffset(varDscInfo->stackArgSize); +#if defined(OSX_ARM64_ABI) + varDscInfo->stackArgSize += argSize; +#else varDscInfo->stackArgSize += roundUp(argSize, TARGET_POINTER_SIZE); +#endif #endif // FEATURE_FASTTAILCALL } @@ -5254,7 +5259,9 @@ void Compiler::lvaAssignVirtualFrameOffsetsToArgs() /* Update the argOffs to reflect arguments that are passed in registers */ noway_assert(codeGen->intRegState.rsCalleeRegArgCount <= MAX_REG_ARG); +#if !defined(OSX_ARM64_ABI) noway_assert(compArgSize >= codeGen->intRegState.rsCalleeRegArgCount * REGSIZE_BYTES); +#endif #ifdef TARGET_X86 argOffs -= codeGen->intRegState.rsCalleeRegArgCount * REGSIZE_BYTES; diff --git a/src/coreclr/src/jit/lower.cpp b/src/coreclr/src/jit/lower.cpp index d047083e763f3..fe50b6c405dee 100644 --- a/src/coreclr/src/jit/lower.cpp +++ b/src/coreclr/src/jit/lower.cpp @@ -1041,17 +1041,20 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf #endif // TARGET_ARM } - unsigned slotNumber = info->GetByteOffset() / TARGET_POINTER_SIZE; -#if defined(FEATURE_PUT_STRUCT_ARG_STK) - unsigned numberOfStackSlots = info->GetStackSlotsNumber(); - DEBUG_ARG_SLOTS_ASSERT(numberOfStackSlots == info->numSlots); -#endif + const unsigned slotNumber = info->GetByteOffset() / TARGET_POINTER_SIZE; DEBUG_ARG_SLOTS_ASSERT(slotNumber == info->slotNum); + const bool putInIncomingArgArea = call->IsFastTailCall(); putArg = new (comp, GT_PUTARG_SPLIT) - GenTreePutArgSplit(arg, slotNumber PUT_STRUCT_ARG_STK_ONLY_ARG(numberOfStackSlots), info->numRegs, - call->IsFastTailCall(), call); - + GenTreePutArgSplit(arg, info->GetByteOffset(), +#if defined(DEBUG_ARG_SLOTS) && defined(FEATURE_PUT_STRUCT_ARG_STK) + info->GetStackByteSize(), slotNumber, info->GetStackSlotsNumber(), +#elif defined(DEBUG_ARG_SLOTS) && !defined(FEATURE_PUT_STRUCT_ARG_STK) + slotNumber, +#elif !defined(DEBUG_ARG_SLOTS) && defined(FEATURE_PUT_STRUCT_ARG_STK) + info->GetStackByteSize(), +#endif + info->numRegs, call, putInIncomingArgArea); // If struct argument is morphed to GT_FIELD_LIST node(s), // we can know GC info by type of each GT_FIELD_LIST node. // So we skip setting GC Pointer info. @@ -1148,7 +1151,7 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf #if defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK) if (type == TYP_SIMD12) { - assert(info->numSlots == 3); + assert(info->GetByteSize() == 12); } else #endif // defined(FEATURE_SIMD) && defined(FEATURE_PUT_STRUCT_ARG_STK) @@ -1156,17 +1159,19 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf assert(genActualType(arg->TypeGet()) == type); } } - unsigned slotNumber = info->GetByteOffset() / TARGET_POINTER_SIZE; -#if defined(FEATURE_PUT_STRUCT_ARG_STK) - unsigned numberOfStackSlots = info->GetStackSlotsNumber(); - DEBUG_ARG_SLOTS_ASSERT(numberOfStackSlots == info->numSlots); + const unsigned slotNumber = info->GetByteOffset() / TARGET_POINTER_SIZE; + const bool putInIncomingArgArea = call->IsFastTailCall(); + + putArg = new (comp, GT_PUTARG_STK) + GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg, info->GetByteOffset(), +#if defined(DEBUG_ARG_SLOTS) && defined(FEATURE_PUT_STRUCT_ARG_STK) + info->GetStackByteSize(), slotNumber, info->GetStackSlotsNumber(), +#elif defined(DEBUG_ARG_SLOTS) && !defined(FEATURE_PUT_STRUCT_ARG_STK) + slotNumber, +#elif !defined(DEBUG_ARG_SLOTS) && defined(FEATURE_PUT_STRUCT_ARG_STK) + info->GetStackByteSize(), #endif - DEBUG_ARG_SLOTS_ASSERT(slotNumber == info->slotNum); - - putArg = - new (comp, GT_PUTARG_STK) GenTreePutArgStk(GT_PUTARG_STK, TYP_VOID, arg, - slotNumber PUT_STRUCT_ARG_STK_ONLY_ARG(numberOfStackSlots), - call->IsFastTailCall(), call); + call, putInIncomingArgArea); #ifdef FEATURE_PUT_STRUCT_ARG_STK // If the ArgTabEntry indicates that this arg is a struct @@ -1227,7 +1232,7 @@ GenTree* Lowering::NewPutArg(GenTreeCall* call, GenTree* arg, fgArgTabEntry* inf } else if (!arg->OperIs(GT_FIELD_LIST)) { - assert(varTypeIsSIMD(arg) || (info->numSlots == 1)); + assert(varTypeIsSIMD(arg) || (info->GetStackSlotsNumber() == 1)); } } #endif // FEATURE_PUT_STRUCT_ARG_STK @@ -1892,7 +1897,7 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) GenTreePutArgStk* put = putargs.Bottom(i)->AsPutArgStk(); unsigned int overwrittenStart = put->getArgOffset(); - unsigned int overwrittenEnd = overwrittenStart + put->getArgSize(); + unsigned int overwrittenEnd = overwrittenStart + put->GetStackByteSize(); #if !(defined(TARGET_WINDOWS) && defined(TARGET_AMD64)) int baseOff = -1; // Stack offset of first arg on stack #endif @@ -4222,8 +4227,7 @@ void Lowering::InsertPInvokeCallProlog(GenTreeCall* call) #if !defined(TARGET_64BIT) // On 32-bit targets, indirect calls need the size of the stack args in InlinedCallFrame.m_Datum. const unsigned stackByteOffset = call->fgArgInfo->GetNextSlotByteOffset(); - - src = comp->gtNewIconNode(stackByteOffset, TYP_INT); + src = comp->gtNewIconNode(stackByteOffset, TYP_INT); #else // On 64-bit targets, indirect calls may need the stub parameter value in InlinedCallFrame.m_Datum. // If the stub parameter value is not needed, m_Datum will be initialized by the VM. diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp index 8a02933bfc640..a121181495135 100644 --- a/src/coreclr/src/jit/lowerxarch.cpp +++ b/src/coreclr/src/jit/lowerxarch.cpp @@ -397,7 +397,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) // Now that the fields have been sorted, the kind of code we will generate. bool allFieldsAreSlots = true; - unsigned prevOffset = putArgStk->getArgSize(); + unsigned prevOffset = putArgStk->GetStackByteSize(); for (GenTreeFieldList::Use& use : fieldList->Uses()) { GenTree* const fieldNode = use.GetNode(); @@ -520,7 +520,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk) // The cpyXXXX code is rather complex and this could cause it to be more complex, but // it might be the right thing to do. - ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE; + unsigned size = putArgStk->GetStackByteSize(); // TODO-X86-CQ: The helper call either is not supported on x86 or required more work // (I don't know which). diff --git a/src/coreclr/src/jit/lsraxarch.cpp b/src/coreclr/src/jit/lsraxarch.cpp index 02f739d15ddc6..db787c55fdf28 100644 --- a/src/coreclr/src/jit/lsraxarch.cpp +++ b/src/coreclr/src/jit/lsraxarch.cpp @@ -1488,7 +1488,7 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) RefPosition* simdTemp = nullptr; RefPosition* intTemp = nullptr; - unsigned prevOffset = putArgStk->getArgSize(); + unsigned prevOffset = putArgStk->GetStackByteSize(); // We need to iterate over the fields twice; once to determine the need for internal temps, // and once to actually build the uses. for (GenTreeFieldList::Use& use : putArgStk->gtOp1->AsFieldList()->Uses()) @@ -1571,27 +1571,16 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk) ClassLayout* layout = src->AsObj()->GetLayout(); - ssize_t size = putArgStk->gtNumSlots * TARGET_POINTER_SIZE; + ssize_t size = putArgStk->GetStackByteSize(); switch (putArgStk->gtPutArgStkKind) { case GenTreePutArgStk::Kind::Push: case GenTreePutArgStk::Kind::PushAllSlots: case GenTreePutArgStk::Kind::Unroll: // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg. - // - // x86 specific note: if the size is odd, the last copy operation would be of size 1 byte. - // But on x86 only RBM_BYTE_REGS could be used as byte registers. Therefore, exclude - // RBM_NON_BYTE_REGS from internal candidates. if (!layout->HasGCPtr() && (size & (XMM_REGSIZE_BYTES - 1)) != 0) { regMaskTP regMask = allRegs(TYP_INT); - -#ifdef TARGET_X86 - if ((size % 2) != 0) - { - regMask &= ~RBM_NON_BYTE_REGS; - } -#endif buildInternalIntRegisterDefForNode(putArgStk, regMask); } diff --git a/src/coreclr/src/jit/morph.cpp b/src/coreclr/src/jit/morph.cpp index 2679b45683ba4..4d3b852172864 100644 --- a/src/coreclr/src/jit/morph.cpp +++ b/src/coreclr/src/jit/morph.cpp @@ -1187,17 +1187,10 @@ void fgArgInfo::UpdateStkArg(fgArgTabEntry* curArgTabEntry, GenTree* node, bool assert(curArgTabEntry->slotNum == nextSlotNum); nextSlotNum += curArgTabEntry->numSlots; #endif + nextStackByteOffset = roundUp(nextStackByteOffset, curArgTabEntry->byteAlignment); assert(curArgTabEntry->GetByteOffset() == nextStackByteOffset); - - if (!curArgTabEntry->IsSplit()) - { - nextStackByteOffset += curArgTabEntry->GetByteSize(); - } - else - { - nextStackByteOffset += curArgTabEntry->GetStackByteSize(); - } + nextStackByteOffset += curArgTabEntry->GetStackByteSize(); } void fgArgInfo::SplitArg(unsigned argNum, unsigned numRegs, unsigned numSlots)