From 4d768d6c50ee09082f0c78d2637f398f104e9496 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 26 Jul 2024 12:02:52 +0200 Subject: [PATCH 1/2] JIT: Fix placement of `GT_START_NOGC` for tailcalls in face of bulk copy with write barrier calls When the JIT generates code for a tailcall it must generate code to write the arguments into the incoming parameter area. Since the GC ness of the arguments of the tailcall may not match the GC ness of the parameters, we have to disable GC before we start writing these. This is done by finding the earliest `GT_PUTARG_STK` node and placing the start of the NOGC region right before it. In addition, there is logic to take care of potential overlap between the arguments and parameters. For example, if the call has an operand that uses one of the parameters, then we must take care that we do not override that parameter with the tailcall argument before the use of it. To do so, we sometimes may need to introduce copies from the parameter locals to locals on the stack frame. This used to work fine, however, with #101761 we started transforming block copies into managed calls in certain scenarios. It was possible for the JIT to decide to introduce a copy to a local and for this transformation to then kick in. This would cause us to end up with the managed helper call after starting the nogc region. In checked builds this would hit an assert during GC scan; in release builds, it would end up with corrupted data. The fix here is to make sure we insert the `GT_START_NOGC` after all the potential temporary copies we may introduce as part of the tailcat stll logic. There was an additional assumption that the first `PUTARG_STK` operand was the earliest one in execution order. That is not guaranteed, so this change stops relying on that as well by introducing a new `LIR::FirstNode` and using that to determine the earliest `PUTARG_STK` node. Fix #102370 Fix #104123 Fix #105441 --- src/coreclr/jit/lir.cpp | 16 +++++++++ src/coreclr/jit/lir.h | 1 + src/coreclr/jit/lower.cpp | 73 ++++++++++++++++++--------------------- 3 files changed, 50 insertions(+), 40 deletions(-) diff --git a/src/coreclr/jit/lir.cpp b/src/coreclr/jit/lir.cpp index feabec03d02ca..44570ea986404 100644 --- a/src/coreclr/jit/lir.cpp +++ b/src/coreclr/jit/lir.cpp @@ -1872,6 +1872,22 @@ GenTree* LIR::LastNode(GenTree** nodes, size_t numNodes) return lastNode; } +//------------------------------------------------------------------------ +// LIR::FirstNode: +// Given two nodes in the same block range, find which node appears first. +// +// Arguments: +// node1 - The first node +// node2 - The second node +// +// Returns: +// Node that appears first. +// +GenTree* LIR::FirstNode(GenTree* node1, GenTree* node2) +{ + return LastNode(node1, node2) == node1 ? node2 : node1; +} + #ifdef DEBUG void GenTree::dumpLIRFlags() { diff --git a/src/coreclr/jit/lir.h b/src/coreclr/jit/lir.h index 8a3a9a507a38b..a3271e832fa8d 100644 --- a/src/coreclr/jit/lir.h +++ b/src/coreclr/jit/lir.h @@ -317,6 +317,7 @@ class LIR final static GenTree* LastNode(GenTree* node1, GenTree* node2); static GenTree* LastNode(GenTree** nodes, size_t numNodes); + static GenTree* FirstNode(GenTree* node1, GenTree* node2); }; inline void GenTree::SetUnusedValue() diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 1eafc393d498a..d537f56e34c61 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -3060,52 +3060,22 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) // call could over-write the stack arg that is setup earlier. ArrayStack putargs(comp->getAllocator(CMK_ArrayStack)); - for (CallArg& arg : call->gtArgs.EarlyArgs()) - { - if (arg.GetEarlyNode()->OperIs(GT_PUTARG_STK)) - { - putargs.Push(arg.GetEarlyNode()); - } - } - - for (CallArg& arg : call->gtArgs.LateArgs()) + for (CallArg& arg : call->gtArgs.Args()) { - if (arg.GetLateNode()->OperIs(GT_PUTARG_STK)) + if (arg.GetNode()->OperIs(GT_PUTARG_STK)) { - putargs.Push(arg.GetLateNode()); + putargs.Push(arg.GetNode()); } } GenTree* startNonGCNode = nullptr; - if (!putargs.Empty()) + if (putargs.Height() > 0) { - // Get the earliest operand of the first PUTARG_STK node. We will make - // the required copies of args before this node. - bool unused; - GenTree* insertionPoint = BlockRange().GetTreeRange(putargs.Bottom(), &unused).FirstNode(); - // Insert GT_START_NONGC node before we evaluate the PUTARG_STK args. - // Note that if there are no args to be setup on stack, no need to - // insert GT_START_NONGC node. - startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID); - BlockRange().InsertBefore(insertionPoint, startNonGCNode); - - // Gc-interruptability in the following case: - // foo(a, b, c, d, e) { bar(a, b, c, d, e); } - // bar(a, b, c, d, e) { foo(a, b, d, d, e); } - // - // Since the instruction group starting from the instruction that sets up first - // stack arg to the end of the tail call is marked as non-gc interruptible, - // this will form a non-interruptible tight loop causing gc-starvation. To fix - // this we insert GT_NO_OP as embedded stmt before GT_START_NONGC, if the method - // has a single basic block and is not a GC-safe point. The presence of a single - // nop outside non-gc interruptible region will prevent gc starvation. - if ((comp->fgBBcount == 1) && !comp->compCurBB->HasFlag(BBF_GC_SAFE_POINT)) + GenTree* firstPutargStk = putargs.Bottom(0); + for (int i = 1; i < putargs.Height(); i++) { - assert(comp->fgFirstBB == comp->compCurBB); - GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID); - BlockRange().InsertBefore(startNonGCNode, noOp); + firstPutargStk = LIR::FirstNode(firstPutargStk, putargs.Bottom(i)); } - // Since this is a fast tailcall each PUTARG_STK will place the argument in the // _incoming_ arg space area. This will effectively overwrite our already existing // incoming args that live in that area. If we have later uses of those args, this @@ -3175,10 +3145,10 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) GenTree* lookForUsesFrom = put->gtNext; if (overwrittenStart != argStart) { - lookForUsesFrom = insertionPoint; + lookForUsesFrom = firstPutargStk; } - RehomeArgForFastTailCall(callerArgLclNum, insertionPoint, lookForUsesFrom, call); + RehomeArgForFastTailCall(callerArgLclNum, firstPutargStk, lookForUsesFrom, call); // The above call can introduce temps and invalidate the pointer. callerArgDsc = comp->lvaGetDesc(callerArgLclNum); @@ -3192,10 +3162,33 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) unsigned int fieldsEnd = fieldsFirst + callerArgDsc->lvFieldCnt; for (unsigned int j = fieldsFirst; j < fieldsEnd; j++) { - RehomeArgForFastTailCall(j, insertionPoint, lookForUsesFrom, call); + RehomeArgForFastTailCall(j, firstPutargStk, lookForUsesFrom, call); } } } + + // Now insert GT_START_NONGC node before we evaluate the first PUTARG_STK. + // Note that if there are no args to be setup on stack, no need to + // insert GT_START_NONGC node. + startNonGCNode = new (comp, GT_START_NONGC) GenTree(GT_START_NONGC, TYP_VOID); + BlockRange().InsertBefore(firstPutargStk, startNonGCNode); + + // Gc-interruptability in the following case: + // foo(a, b, c, d, e) { bar(a, b, c, d, e); } + // bar(a, b, c, d, e) { foo(a, b, d, d, e); } + // + // Since the instruction group starting from the instruction that sets up first + // stack arg to the end of the tail call is marked as non-gc interruptible, + // this will form a non-interruptible tight loop causing gc-starvation. To fix + // this we insert GT_NO_OP as embedded stmt before GT_START_NONGC, if the method + // has a single basic block and is not a GC-safe point. The presence of a single + // nop outside non-gc interruptible region will prevent gc starvation. + if ((comp->fgBBcount == 1) && !comp->compCurBB->HasFlag(BBF_GC_SAFE_POINT)) + { + assert(comp->fgFirstBB == comp->compCurBB); + GenTree* noOp = new (comp, GT_NO_OP) GenTree(GT_NO_OP, TYP_VOID); + BlockRange().InsertBefore(startNonGCNode, noOp); + } } // Insert GT_PROF_HOOK node to emit profiler tail call hook. This should be From 4aa09b2517033f69f37af842ceba2cc27d0d1cc8 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Fri, 26 Jul 2024 12:36:26 +0200 Subject: [PATCH 2/2] Nit --- src/coreclr/jit/lower.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index d537f56e34c61..aa0581a8ceb39 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -3069,7 +3069,7 @@ void Lowering::LowerFastTailCall(GenTreeCall* call) } GenTree* startNonGCNode = nullptr; - if (putargs.Height() > 0) + if (!putargs.Empty()) { GenTree* firstPutargStk = putargs.Bottom(0); for (int i = 1; i < putargs.Height(); i++)