From 18bd85e7b4114dc8307cf2f76ba19ac8604ca51b Mon Sep 17 00:00:00 2001
From: Shuhei Kadowaki <aviatesk@gmail.com>
Date: Tue, 6 Sep 2022 23:23:53 +0900
Subject: [PATCH] inlining: relax `finalizer` inlining control-flow restriction

Eager `finalizer` inlining (#45272) currently has a restriction that
requires all the def/uses to be in the same basic block.

This commit relaxes that restriction a bit by allowing def/uses to
involve control flow when all of them are dominated by a `finalizer`
call to be inlined, since in that case it is safe to insert the body of
`finalizer` at the end of all the def/uses, e.g.
```julia
const FINALIZATION_COUNT = Ref(0)
init_finalization_count!() = FINALIZATION_COUNT[] = 0
get_finalization_count() = FINALIZATION_COUNT[]
@noinline add_finalization_count!(x) = FINALIZATION_COUNT[] += x
@noinline Base.@assume_effects :nothrow safeprint(io::IO, x...) = (@nospecialize; print(io, x...))
mutable struct DoAllocWithFieldInter
    x::Int
end
function register_finalizer!(obj::DoAllocWithFieldInter)
    finalizer(obj) do this
        add_finalization_count!(this.x)
    end
end

function cfg_finalization3(io)
    for i = -999:1000
        o = DoAllocWithFieldInter(i)
        register_finalizer!(o)
        if i == 1000
            safeprint(io, o.x, '\n')
        elseif i > 0
            safeprint(io, o.x)
        end
    end
end
let src = code_typed1(cfg_finalization3, (IO,))
    @test count(isinvoke(:add_finalization_count!), src.code) == 1
end
let
    init_finalization_count!()
    cfg_finalization3(IOBuffer())
    @test get_finalization_count() == 1000
end
```

To support this transformation, the domtree code also gains the ability
to represent post-dominator trees, which is generally useful.

Co-authored-by: Keno Fischer <keno@juliacomputing.com>
---
 base/compiler/ssair/domtree.jl | 137 ++++++++++++++++++++++-------
 base/compiler/ssair/passes.jl  | 152 +++++++++++++++++++++++++--------
 test/compiler/inline.jl        | 110 ++++++++++++++++++++++++
 test/compiler/irutils.jl       |   7 +-
 test/compiler/ssair.jl         |   5 +-
 5 files changed, 340 insertions(+), 71 deletions(-)

diff --git a/base/compiler/ssair/domtree.jl b/base/compiler/ssair/domtree.jl
index 59016080b52040..b1fd68483041d6 100644
--- a/base/compiler/ssair/domtree.jl
+++ b/base/compiler/ssair/domtree.jl
@@ -109,10 +109,16 @@ end
 
 length(D::DFSTree) = length(D.from_pre)
 
-function DFS!(D::DFSTree, blocks::Vector{BasicBlock})
+function DFS!(D::DFSTree, blocks::Vector{BasicBlock}, is_post_dominator::Bool)
     copy!(D, DFSTree(length(blocks)))
-    to_visit = Tuple{BBNumber, PreNumber, Bool}[(1, 0, false)]
-    pre_num = 1
+    if is_post_dominator
+        # TODO: We're using -1 as the virtual exit node here. Would it make
+        #       sense to actually have a real BB for the exit always?
+        to_visit = Tuple{BBNumber, PreNumber, Bool}[(-1, 0, false)]
+    else
+        to_visit = Tuple{BBNumber, PreNumber, Bool}[(1, 0, false)]
+    end
+    pre_num = is_post_dominator ? 0 : 1
     post_num = 1
     while !isempty(to_visit)
         # Because we want the postorder number as well as the preorder number,
@@ -123,12 +129,14 @@ function DFS!(D::DFSTree, blocks::Vector{BasicBlock})
         if pushed_children
             # Going up the DFS tree, so all we need to do is record the
             # postorder number, then move on
-            D.to_post[current_node_bb] = post_num
-            D.from_post[post_num] = current_node_bb
+            if current_node_bb != -1
+                D.to_post[current_node_bb] = post_num
+                D.from_post[post_num] = current_node_bb
+            end
             post_num += 1
             pop!(to_visit)
 
-        elseif D.to_pre[current_node_bb] != 0
+        elseif current_node_bb != -1 && D.to_pre[current_node_bb] != 0
             # Node has already been visited, move on
             pop!(to_visit)
             continue
@@ -136,15 +144,24 @@ function DFS!(D::DFSTree, blocks::Vector{BasicBlock})
             # Going down the DFS tree
 
             # Record preorder number
-            D.to_pre[current_node_bb] = pre_num
-            D.from_pre[pre_num] = current_node_bb
-            D.to_parent_pre[pre_num] = parent_pre
+            if current_node_bb != -1
+                D.to_pre[current_node_bb] = pre_num
+                D.from_pre[pre_num] = current_node_bb
+                D.to_parent_pre[pre_num] = parent_pre
+            end
 
             # Record that children (will) have been pushed
             to_visit[end] = (current_node_bb, parent_pre, true)
 
+            if is_post_dominator && current_node_bb == -1
+                edges = Int[bb for bb in 1:length(blocks) if isempty(blocks[bb].succs)]
+            else
+                edges = is_post_dominator ? blocks[current_node_bb].preds :
+                                            blocks[current_node_bb].succs
+            end
+
             # Push children to the stack
-            for succ_bb in blocks[current_node_bb].succs
+            for succ_bb in edges
                 push!(to_visit, (succ_bb, pre_num, false))
             end
 
@@ -161,7 +178,7 @@ function DFS!(D::DFSTree, blocks::Vector{BasicBlock})
     return D
 end
 
-DFS(blocks::Vector{BasicBlock}) = DFS!(DFSTree(0), blocks)
+DFS(blocks::Vector{BasicBlock}, is_post_dominator::Bool=false) = DFS!(DFSTree(0), blocks, is_post_dominator)
 
 """
 Keeps the per-BB state of the Semi NCA algorithm. In the original formulation,
@@ -184,7 +201,7 @@ end
 DomTreeNode() = DomTreeNode(1, Vector{BBNumber}())
 
 "Data structure that encodes which basic block dominates which."
-struct DomTree
+struct GenericDomTree{IsPostDom}
     # These can be reused when updating domtree dynamically
     dfs_tree::DFSTree
     snca_state::Vector{SNCAData}
@@ -195,19 +212,25 @@ struct DomTree
     # The nodes in the tree (ordered by BB indices)
     nodes::Vector{DomTreeNode}
 end
+const DomTree = GenericDomTree{false}
+const PostDomTree = GenericDomTree{true}
 
-function DomTree()
-    return DomTree(DFSTree(0), SNCAData[], BBNumber[], DomTreeNode[])
+function (T::Type{<:GenericDomTree})()
+    return T(DFSTree(0), SNCAData[], BBNumber[], DomTreeNode[])
 end
 
 function construct_domtree(blocks::Vector{BasicBlock})
     return update_domtree!(blocks, DomTree(), true, 0)
 end
 
-function update_domtree!(blocks::Vector{BasicBlock}, domtree::DomTree,
-                         recompute_dfs::Bool, max_pre::PreNumber)
+function construct_postdomtree(blocks::Vector{BasicBlock})
+    return update_domtree!(blocks, PostDomTree(), true, 0)
+end
+
+function update_domtree!(blocks::Vector{BasicBlock}, domtree::GenericDomTree{IsPostDom},
+                         recompute_dfs::Bool, max_pre::PreNumber) where {IsPostDom}
     if recompute_dfs
-        DFS!(domtree.dfs_tree, blocks)
+        DFS!(domtree.dfs_tree, blocks, IsPostDom)
     end
 
     if max_pre == 0
@@ -219,17 +242,24 @@ function update_domtree!(blocks::Vector{BasicBlock}, domtree::DomTree,
     return domtree
 end
 
-function compute_domtree_nodes!(domtree::DomTree)
+function compute_domtree_nodes!(domtree::GenericDomTree{IsPostDom}) where {IsPostDom}
     # Compute children
     copy!(domtree.nodes,
           DomTreeNode[DomTreeNode() for _ in 1:length(domtree.idoms_bb)])
     for (idx, idom) in Iterators.enumerate(domtree.idoms_bb)
-        (idx == 1 || idom == 0) && continue
+        ((!IsPostDom && idx == 1) || idom == 0) && continue
         push!(domtree.nodes[idom].children, idx)
     end
     # n.b. now issorted(domtree.nodes[*].children) since idx is sorted above
     # Recursively set level
-    update_level!(domtree.nodes, 1, 1)
+    if IsPostDom
+        for (node, idom) in enumerate(domtree.idoms_bb)
+            idom == 0 || continue
+            update_level!(domtree.nodes, node, 1)
+        end
+    else
+        update_level!(domtree.nodes, 1, 1)
+    end
     return domtree.nodes
 end
 
@@ -244,13 +274,18 @@ function update_level!(nodes::Vector{DomTreeNode}, node::BBNumber, level::Int)
     end
 end
 
+dom_edges(domtree::DomTree, blocks::Vector{BasicBlock}, idx::BBNumber) =
+    blocks[idx].preds
+dom_edges(domtree::PostDomTree, blocks::Vector{BasicBlock}, idx::BBNumber) =
+    blocks[idx].succs
+
 """
 The main Semi-NCA algorithm. Matches Figure 2.8 in [LG05]. Note that the
 pseudocode in [LG05] is not entirely accurate. The best way to understand
 what's happening is to read [LT79], then the description of SLT in [LG05]
 (warning: inconsistent notation), then the description of Semi-NCA.
 """
-function SNCA!(domtree::DomTree, blocks::Vector{BasicBlock}, max_pre::PreNumber)
+function SNCA!(domtree::GenericDomTree{IsPostDom}, blocks::Vector{BasicBlock}, max_pre::PreNumber) where {IsPostDom}
     D = domtree.dfs_tree
     state = domtree.snca_state
     # There may be more blocks than are reachable in the DFS / dominator tree
@@ -289,13 +324,14 @@ function SNCA!(domtree::DomTree, blocks::Vector{BasicBlock}, max_pre::PreNumber)
     # Calculate semidominators, but only for blocks with preorder number up to
     # max_pre
     ancestors = copy(D.to_parent_pre)
-    for w::PreNumber in reverse(2:max_pre)
+    relevant_blocks = IsPostDom ? (1:max_pre) : (2:max_pre)
+    for w::PreNumber in reverse(relevant_blocks)
         # LLVM initializes this to the parent, the paper initializes this to
         # `w`, but it doesn't really matter (the parent is a predecessor, so at
         # worst we'll discover it below). Save a memory reference here.
         semi_w = typemax(PreNumber)
         last_linked = PreNumber(w + 1)
-        for v ∈ blocks[D.from_pre[w]].preds
+        for v ∈ dom_edges(domtree, blocks, D.from_pre[w])
             # For the purpose of the domtree, ignore virtual predecessors into
             # catch blocks.
             v == 0 && continue
@@ -331,7 +367,7 @@ function SNCA!(domtree::DomTree, blocks::Vector{BasicBlock}, max_pre::PreNumber)
     # ancestor in the (immediate) dominator tree between its semidominator and
     # its parent (see Lemma 2.6 in [LG05]).
     idoms_pre = copy(D.to_parent_pre)
-    for v in 2:n_nodes
+    for v in (IsPostDom ? (1:n_nodes) : (2:n_nodes))
         idom = idoms_pre[v]
         vsemi = state[v].semi
         while idom > vsemi
@@ -343,10 +379,11 @@ function SNCA!(domtree::DomTree, blocks::Vector{BasicBlock}, max_pre::PreNumber)
     # Express idoms in BB indexing
     resize!(domtree.idoms_bb, n_blocks)
     for i::BBNumber in 1:n_blocks
-        if i == 1 || D.to_pre[i] == 0
+        if (!IsPostDom && i == 1) || D.to_pre[i] == 0
             domtree.idoms_bb[i] = 0
         else
-            domtree.idoms_bb[i] = D.from_pre[idoms_pre[D.to_pre[i]]]
+            ip = idoms_pre[D.to_pre[i]]
+            domtree.idoms_bb[i] = ip == 0 ? 0 : D.from_pre[ip]
         end
     end
 end
@@ -549,7 +586,12 @@ Checks if `bb1` dominates `bb2`.
 `bb1` dominates `bb2` if the only way to enter `bb2` is via `bb1`.
 (Other blocks may be in between, e.g `bb1->bbx->bb2`).
 """
-function dominates(domtree::DomTree, bb1::BBNumber, bb2::BBNumber)
+dominates(domtree::DomTree, bb1::BBNumber, bb2::BBNumber) =
+    _dominates(domtree, bb1, bb2)
+postdominates(domtree::PostDomTree, bb1::BBNumber, bb2::BBNumber) =
+    _dominates(domtree, bb1, bb2)
+
+function _dominates(domtree::GenericDomTree, bb1::BBNumber, bb2::BBNumber)
     bb1 == bb2 && return true
     target_level = domtree.nodes[bb1].level
     source_level = domtree.nodes[bb2].level
@@ -584,19 +626,48 @@ function iterate(doms::DominatedBlocks, state::Nothing=nothing)
     return (bb, nothing)
 end
 
-function naive_idoms(blocks::Vector{BasicBlock})
+"""
+    nearest_common_dominator(domtree::GenericDomTree, a::BBNumber, b::BBNumber)
+
+Compute the nearest common (post-)dominator of `a` and `b`.
+"""
+function nearest_common_dominator(domtree::GenericDomTree, a::BBNumber, b::BBNumber)
+    (a == 0 || b == 0) && return BBNumber(0) # `0` (no block) has no entry in `domtree.nodes`
+    alevel = domtree.nodes[a].level
+    blevel = domtree.nodes[b].level
+    # W.l.o.g. make `a` the deeper node, i.e. blevel <= alevel
+    if alevel < blevel
+        a, b, alevel, blevel = b, a, blevel, alevel
+    end
+    while alevel > blevel
+        a = domtree.idoms_bb[a]
+        alevel -= 1
+    end
+    while a != b && a != 0
+        a = domtree.idoms_bb[a]
+        b = domtree.idoms_bb[b]
+    end
+    @assert a == b
+    return a
+end
+
+function naive_idoms(blocks::Vector{BasicBlock}, is_post_dominator::Bool=false)
     nblocks = length(blocks)
     # The extra +1 helps us detect unreachable blocks below
     dom_all = BitSet(1:nblocks+1)
-    dominators = BitSet[n == 1 ? BitSet(1) : copy(dom_all) for n = 1:nblocks]
+    dominators = is_post_dominator ?
+        BitSet[isempty(blocks[n].succs) ? BitSet(n) : copy(dom_all) for n = 1:nblocks] :
+        BitSet[n == 1 ? BitSet(1) : copy(dom_all) for n = 1:nblocks]
     changed = true
+    relevant_blocks = (is_post_dominator ? (1:nblocks) : (2:nblocks))
     while changed
         changed = false
-        for n = 2:nblocks
-            if isempty(blocks[n].preds)
+        for n in relevant_blocks
+            edges = is_post_dominator ? blocks[n].succs : blocks[n].preds
+            if isempty(edges)
                 continue
             end
-            firstp, rest = Iterators.peel(Iterators.filter(p->p != 0, blocks[n].preds))::NTuple{2,Any}
+            firstp, rest = Iterators.peel(Iterators.filter(p->p != 0, edges))::NTuple{2,Any}
             new_doms = copy(dominators[firstp])
             for p in rest
                 intersect!(new_doms, dominators[p])
@@ -608,7 +679,7 @@ function naive_idoms(blocks::Vector{BasicBlock})
     end
     # Compute idoms
     idoms = fill(0, nblocks)
-    for i = 2:nblocks
+    for i in relevant_blocks
         if dominators[i] == dom_all
             idoms[i] = 0
             continue
diff --git a/base/compiler/ssair/passes.jl b/base/compiler/ssair/passes.jl
index 70ae94e611a1fb..00577526ac2eb2 100644
--- a/base/compiler/ssair/passes.jl
+++ b/base/compiler/ssair/passes.jl
@@ -595,16 +595,21 @@ function is_old(compact, @nospecialize(old_node_ssa))
         !already_inserted(compact, old_node_ssa)
 end
 
-mutable struct LazyDomtree
+mutable struct LazyGenericDomtree{IsPostDom}
     ir::IRCode
-    domtree::DomTree
-    LazyDomtree(ir::IRCode) = new(ir)
+    domtree::GenericDomTree{IsPostDom}
+    LazyGenericDomtree{IsPostDom}(ir::IRCode) where {IsPostDom} = new{IsPostDom}(ir)
 end
-function get!(x::LazyDomtree)
+function get!(x::LazyGenericDomtree{IsPostDom}) where {IsPostDom}
     isdefined(x, :domtree) && return x.domtree
-    return @timeit "domtree 2" x.domtree = construct_domtree(x.ir.cfg.blocks)
+    return @timeit "domtree 2" x.domtree = IsPostDom ?
+        construct_postdomtree(x.ir.cfg.blocks) :
+        construct_domtree(x.ir.cfg.blocks)
 end
 
+const LazyDomtree = LazyGenericDomtree{false}
+const LazyPostDomtree = LazyGenericDomtree{true}
+
 function perform_lifting!(compact::IncrementalCompact,
         visited_phinodes::Vector{AnySSAValue}, @nospecialize(cache_key),
         lifting_cache::IdDict{Pair{AnySSAValue, Any}, AnySSAValue},
@@ -1051,7 +1056,7 @@ end
 
 # NOTE we resolve the inlining source here as we don't want to serialize `Core.Compiler`
 # data structure into the global cache (see the comment in `handle_finalizer_call!`)
-function try_inline_finalizer!(ir::IRCode, argexprs::Vector{Any}, idx::Int, mi::MethodInstance, inlining::InliningState)
+function try_inline_finalizer!(ir::IRCode, argexprs::Vector{Any}, idx::Int, mi::MethodInstance, inlining::InliningState, attach_after::Bool)
     code = get(inlining.mi_cache, mi, nothing)
     et = InliningEdgeTracker(inlining.et)
     if code isa CodeInstance
@@ -1091,7 +1096,7 @@ function try_inline_finalizer!(ir::IRCode, argexprs::Vector{Any}, idx::Int, mi::
             ssa_rename[ssa.id]
         end
         stmt′ = ssa_substitute_op!(InsertBefore(ir, SSAValue(idx)), inst, stmt′, argexprs, mi.specTypes, mi.sparam_vals, sp_ssa, :default)
-        ssa_rename[idx′] = insert_node!(ir, idx, NewInstruction(stmt′, inst; line = inst[:line] + linetable_offset), true)
+        ssa_rename[idx′] = insert_node!(ir, idx, NewInstruction(stmt′, inst; line = inst[:line] + linetable_offset), attach_after)
     end
 
     return true
@@ -1099,39 +1104,112 @@ end
 
 is_nothrow(ir::IRCode, pc::Int) = (ir.stmts[pc][:flag] & IR_FLAG_NOTHROW) ≠ 0
 
-function try_resolve_finalizer!(ir::IRCode, idx::Int, finalizer_idx::Int, defuse::SSADefUse, inlining::InliningState, info::Union{FinalizerInfo, Nothing})
-    # For now: Require that all uses and defs are in the same basic block,
-    # so that live range calculations are easy.
-    bb = ir.cfg.blocks[block_for_inst(ir.cfg, first(defuse.uses).idx)]
-    minval::Int = typemax(Int)
-    maxval::Int = 0
+# Blocks reachable from `from_bb`; `to_bb` (if given) is included but never expanded.
+function reachable_blocks(cfg::CFG, from_bb, to_bb = nothing)
+    seen = BitSet()
+    to_bb === nothing || push!(seen, to_bb)
+    pending = Int[]
+    if from_bb ∉ seen
+        push!(seen, from_bb)
+        push!(pending, from_bb)
+    end
+    while !isempty(pending)
+        for succ in cfg.blocks[pop!(pending)].succs
+            succ in seen && continue
+            push!(seen, succ)
+            push!(pending, succ)
+        end
+    end
+    return seen
+end
 
-    function check_in_range(x::Union{Int,SSAUse})
-        if isa(x, SSAUse)
-            didx = x.idx
+function try_resolve_finalizer!(ir::IRCode, idx::Int, finalizer_idx::Int, defuse::SSADefUse,
+        inlining::InliningState, lazydomtree::LazyDomtree,
+        lazypostdomtree::LazyPostDomtree, info::Union{FinalizerInfo, Nothing})
+    # For now, require that:
+    # 1. The allocation dominates the finalizer registration
+    # 2. The finalizer registration dominates all uses reachable from the
+    #    finalizer registration.
+    # 3. The insertion block for the finalizer is the post-dominator of all
+    #    uses and the finalizer registration block. The insertion block must
+    #    be dominated by the finalizer registration block.
+    # 4. The path from the finalizer registration to the finalizer inlining
+    #    location is nothrow
+    #
+    # TODO: We could relax item 3, by inlining the finalizer multiple times.
+
+    # Check #1: The allocation dominates the finalizer registration
+    domtree = get!(lazydomtree)
+    finalizer_bb = block_for_inst(ir, finalizer_idx)
+    alloc_bb = block_for_inst(ir, idx)
+    dominates(domtree, alloc_bb, finalizer_bb) || return nothing
+
+    bb_insert_block = finalizer_bb
+    bb_insert_idx = finalizer_idx
+    function note_block_use!(bb, idx)
+        new_bb_insert_block = nearest_common_dominator(get!(lazypostdomtree), bb_insert_block, bb)
+        # Same block: stay after the latest use. Moved into `bb`: after this use. Else: block start.
+        keep = new_bb_insert_block == bb_insert_block && bb_insert_idx !== nothing
+        bb_insert_idx = keep ? max(idx, bb_insert_idx::Int) : (new_bb_insert_block == bb ? idx : nothing)
+        bb_insert_block = new_bb_insert_block
+        nothing
+    end
+
+    # Blocks reachable from the registration; `alloc_bb` only cuts the walk if it is another block
+    blocks = reachable_blocks(ir.cfg, finalizer_bb, finalizer_bb == alloc_bb ? nothing : alloc_bb)
+
+    # Check #2
+    maxval::Int = 0
+    function check_defuse(x::Union{Int,SSAUse})
+        duidx = x isa SSAUse ? x.idx : x
+        duidx == finalizer_idx && return true
+        bb = block_for_inst(ir, duidx)
+        # Not reachable from finalizer registration - we're ok
+        bb ∉ blocks && return true
+        note_block_use!(bb, duidx)
+        if dominates(domtree, finalizer_bb, bb)
+            return true
         else
-            didx = x
-        end
-        didx in bb.stmts || return false
-        if didx < minval
-            minval = didx
+            # reachable from, but not dominated by, the registration: give up
+            return false
         end
-        if didx > maxval
-            maxval = didx
+    end
+    all(check_defuse, defuse.uses) || return nothing
+    all(check_defuse, defuse.defs) || return nothing
+
+    function check_range_nothrow(range)
+        # Every statement in `range` must be nothrow; the allocation (`idx`) and
+        # the `finalizer` registration itself are exempt.
+        return all(range) do sidx::Int
+            sidx == finalizer_idx || sidx == idx || is_nothrow(ir, sidx)
         end
-        return true
     end
 
-    check_in_range(idx) || return nothing
-    all(check_in_range, defuse.uses) || return nothing
-    all(check_in_range, defuse.defs) || return nothing
+    # Check #3 (`0` means the uses share no post-dominating block to insert into)
+    (bb_insert_block != 0 && dominates(domtree, finalizer_bb, bb_insert_block)) || return nothing
+
+    # Collect all reachable blocks between the finalizer registration and the
+    # insertion point
+    blocks = reachable_blocks(ir.cfg, finalizer_bb, bb_insert_block)
+
+    # Check #4
+    for bb in blocks
+        range = ir.cfg.blocks[bb].stmts
+        if bb == bb_insert_block
+            bb_insert_idx === nothing && continue
+            range = first(range):bb_insert_idx
+        end
+        if bb == finalizer_bb
+            range = finalizer_idx:last(range)
+        end
+        check_range_nothrow(range) || return nothing
+    end
 
-    # For now: Require all statements in the basic block range to be nothrow.
-    all(minval:maxval) do sidx::Int
-        return is_nothrow(ir, idx) || sidx == finalizer_idx || sidx == idx
-    end || return nothing
+    # Ok, legality check complete. Figure out the exact statement where we're
+    # gonna inline the finalizer.
+    loc = bb_insert_idx === nothing ? first(ir.cfg.blocks[bb_insert_block].stmts) : bb_insert_idx
+    attach_after = bb_insert_idx !== nothing
 
-    # Ok, `finalizer` rewrite is legal.
     finalizer_stmt = ir[SSAValue(finalizer_idx)][:inst]
     argexprs = Any[finalizer_stmt.args[2], finalizer_stmt.args[3]]
     flags = info === nothing ? UInt8(0) : flags_for_effects(info.effects)
@@ -1141,14 +1219,14 @@ function try_resolve_finalizer!(ir::IRCode, idx::Int, finalizer_idx::Int, defuse
             # No code in the function - Nothing to do
         else
             mi = finalizer_stmt.args[5]::MethodInstance
-            if inline::Bool && try_inline_finalizer!(ir, argexprs, maxval, mi, inlining)
+            if inline::Bool && try_inline_finalizer!(ir, argexprs, loc, mi, inlining, attach_after)
                 # the finalizer body has been inlined
             else
-                insert_node!(ir, maxval, with_flags(NewInstruction(Expr(:invoke, mi, argexprs...), Nothing), flags), true)
+                insert_node!(ir, loc, with_flags(NewInstruction(Expr(:invoke, mi, argexprs...), Nothing), flags), attach_after)
             end
         end
     else
-        insert_node!(ir, maxval, with_flags(NewInstruction(Expr(:call, argexprs...), Nothing), flags), true)
+        insert_node!(ir, loc, with_flags(NewInstruction(Expr(:call, argexprs...), Nothing), flags), attach_after)
     end
     # Erase the call to `finalizer`
     ir[SSAValue(finalizer_idx)][:inst] = nothing
@@ -1156,6 +1234,7 @@ function try_resolve_finalizer!(ir::IRCode, idx::Int, finalizer_idx::Int, defuse
 end
 
 function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse}}, used_ssas::Vector{Int}, lazydomtree::LazyDomtree, inlining::Union{Nothing, InliningState})
+    lazypostdomtree = LazyPostDomtree(ir)
     for (idx, (intermediaries, defuse)) in defuses
         intermediaries = collect(intermediaries)
         # Check if there are any uses we did not account for. If so, the variable
@@ -1188,7 +1267,8 @@ function sroa_mutables!(ir::IRCode, defuses::IdDict{Int, Tuple{SPCSet, SSADefUse
             end
         end
         if finalizer_idx !== nothing && inlining !== nothing
-            try_resolve_finalizer!(ir, idx, finalizer_idx, defuse, inlining, ir[SSAValue(finalizer_idx)][:info])
+            try_resolve_finalizer!(ir, idx, finalizer_idx, defuse, inlining,
+                lazydomtree, lazypostdomtree, ir[SSAValue(finalizer_idx)][:info])
             continue
         end
         # Partition defuses by field
diff --git a/test/compiler/inline.jl b/test/compiler/inline.jl
index a70bfff62d5eea..72e4b34331edda 100644
--- a/test/compiler/inline.jl
+++ b/test/compiler/inline.jl
@@ -1380,6 +1380,14 @@ mutable struct DoAllocWithField
         end
     end
 end
+mutable struct DoAllocWithFieldInter
+    x::Int
+end
+function register_finalizer!(obj::DoAllocWithFieldInter)
+    finalizer(obj) do this
+        add_finalization_count!(this.x)
+    end
+end
 
 function const_finalization(io)
     for i = 1:1000
@@ -1409,6 +1417,108 @@ let src = code_typed1(useless_finalizer, ())
     @test length(src.code) == 2
 end
 
+# tests finalizer inlining when def/uses involve control flow
+function cfg_finalization1(io)
+    for i = -999:1000
+        o = DoAllocWithField(i)
+        if i == 1000
+            safeprint(io, o.x, '\n')
+        elseif i > 0
+            safeprint(io, o.x)
+        end
+    end
+end
+let src = code_typed1(cfg_finalization1, (IO,))
+    @test count(isinvoke(:add_finalization_count!), src.code) == 1
+end
+let
+    init_finalization_count!()
+    cfg_finalization1(IOBuffer())
+    @test get_finalization_count() == 1000
+end
+
+function cfg_finalization2(io)
+    for i = -999:1000
+        o = DoAllocWithField(1)
+        o.x = i # with `setfield!`
+        if i == 1000
+            safeprint(io, o.x, '\n')
+        elseif i > 0
+            safeprint(io, o.x)
+        end
+    end
+end
+let src = code_typed1(cfg_finalization2, (IO,))
+    @test count(isinvoke(:add_finalization_count!), src.code) == 1
+end
+let
+    init_finalization_count!()
+    cfg_finalization2(IOBuffer())
+    @test get_finalization_count() == 2000
+end
+
+function cfg_finalization3(io)
+    for i = -999:1000
+        o = DoAllocWithFieldInter(i)
+        register_finalizer!(o)
+        if i == 1000
+            safeprint(io, o.x, '\n')
+        elseif i > 0
+            safeprint(io, o.x)
+        end
+    end
+end
+let src = code_typed1(cfg_finalization3, (IO,))
+    @test count(isinvoke(:add_finalization_count!), src.code) == 1
+end
+let
+    init_finalization_count!()
+    cfg_finalization3(IOBuffer())
+    @test get_finalization_count() == 1000
+end
+
+function cfg_finalization4(io)
+    for i = -999:1000
+        o = DoAllocWithFieldInter(1)
+        o.x = i # with `setfield!`
+        register_finalizer!(o)
+        if i == 1000
+            safeprint(io, o.x, '\n')
+        elseif i > 0
+            safeprint(io, o.x)
+        end
+    end
+end
+let src = code_typed1(cfg_finalization4, (IO,))
+    @test count(isinvoke(:add_finalization_count!), src.code) == 1
+end
+let
+    init_finalization_count!()
+    cfg_finalization4(IOBuffer())
+    @test get_finalization_count() == 1000
+end
+
+function cfg_finalization5(io)
+    for i = -999:1000
+        o = DoAllocWithFieldInter(i)
+        if i == 1000
+            safeprint(io, o.x, '\n')
+        elseif i > 0
+            safeprint(io, o.x)
+        end
+        register_finalizer!(o)
+    end
+end
+let src = code_typed1(cfg_finalization5, (IO,))
+    # TODO we can fix this case by checking a case when a finalizer block is post-dominated by all the def/uses
+    @test count(isinvoke(:add_finalization_count!), src.code) == 1
+end
+let
+    init_finalization_count!()
+    cfg_finalization5(IOBuffer())
+    @test get_finalization_count() == 1000
+end
+
 # optimize `[push!|pushfirst!](::Vector{Any}, x...)`
 @testset "optimize `$f(::Vector{Any}, x...)`" for f = Any[push!, pushfirst!]
     @eval begin
diff --git a/test/compiler/irutils.jl b/test/compiler/irutils.jl
index b44a656ea7b341..76f883d6cea2c3 100644
--- a/test/compiler/irutils.jl
+++ b/test/compiler/irutils.jl
@@ -17,7 +17,12 @@ function iscall((src, f)::Tuple{IR,Base.Callable}, @nospecialize(x)) where IR<:U
         singleton_type(argextype(x, src)) === f
     end
 end
-iscall(pred::Base.Callable, @nospecialize(x)) = isexpr(x, :call) && pred(x.args[1])
+# Apply `pred` to the callee of `x`, looking through an `lhs = call(...)` assignment.
+function iscall(pred::Base.Callable, @nospecialize(x))
+    callex = isexpr(x, :(=)) ? x.args[2] : x
+    isexpr(callex, :call) || return false
+    return pred(callex.args[1])
+end
 
 # check if `x` is a statically-resolved call of a function whose name is `sym`
 isinvoke(y) = @nospecialize(x) -> isinvoke(y, x)
diff --git a/test/compiler/ssair.jl b/test/compiler/ssair.jl
index 1acd490a472954..2474c5994b52e7 100644
--- a/test/compiler/ssair.jl
+++ b/test/compiler/ssair.jl
@@ -69,8 +69,10 @@ let cfg = CFG(BasicBlock[
 ], Int[])
     dfs = Compiler.DFS(cfg.blocks)
     @test dfs.from_pre[dfs.to_parent_pre[dfs.to_pre[5]]] == 4
-    let correct_idoms = Compiler.naive_idoms(cfg.blocks)
+    let correct_idoms = Compiler.naive_idoms(cfg.blocks),
+        correct_pidoms = Compiler.naive_idoms(cfg.blocks, true)
         @test Compiler.construct_domtree(cfg.blocks).idoms_bb == correct_idoms
+        @test Compiler.construct_postdomtree(cfg.blocks).idoms_bb == correct_pidoms
         # For completeness, reverse the order of pred/succ in the CFG and verify
         # the answer doesn't change (it does change the which node is chosen
         # as the semi-dominator, since it changes the DFS numbering).
@@ -82,6 +84,7 @@ let cfg = CFG(BasicBlock[
                 d && (blocks[5] = make_bb(reverse(blocks[5].preds), blocks[5].succs))
                 cfg′ = CFG(blocks, cfg.index)
                 @test Compiler.construct_domtree(cfg′.blocks).idoms_bb == correct_idoms
+                @test Compiler.construct_postdomtree(cfg′.blocks).idoms_bb == correct_pidoms
             end
         end
     end