From 82b833e512e0de19ca9e516de7e6af58d1156c74 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 22 Jan 2025 15:52:12 -0800 Subject: [PATCH] Cranelift/x64 backend: do not use one-way branches. In #9980, we saw that code copmiled with the single-pass register allocator has incorrect behavior. We eventually narrowed this down to the fact that the single-pass allocator is inserting code meant to be at the end of a block, just before its terminator, *between* two branches that form the terminator sequence. The allocator is correct; the bug is with Cranelift's x64 backend. When we produce instructions into a VCode container, we maintain basic blocks, and we have the invariant (usual for basic block-based IR) that only the last -- terminator -- instruction is a branch that can leave the block. Even the conditional branches maintain this invariant: though VCode is meant to be "almost machine code", we emit *two-target conditionals* that are semantically like "jcond; jmp". We then are able to optimize this inline during binary emission in the `MachBuffer`: the buffer knows about unconditional and conditional branches and will "chomp" branches off the tail of the buffer whenever they target the fallthrough block. (We designed the system this way because it is simpler to think about BBs that are order-invariant, i.e., not bake the "fallthrough" concept into the IR.) Thus we have a simpler abstraction but produce optimal terminator sequences. Unfortunately, when adding a branch-on-floating-point-compare lowering, we had the need to branch to a target if either of *two* conditions were true, and rather than add a new kind of terminator instruction, we added a "one-armed branch": conditionally branch to label or fall through. We emitted this in sequence right before the actual terminator, so semantically it was almost equivalent. I write "almost" because the register allocator *is* allowed to insert spills/reloads/moves between any two instructions. Here the distinct pieces of the terminator sequence matter: the allocator might insert something just before the last instruction, assuming the basic-block "single in, single out" invariant means this will always run with the block. With one-armed branches this is no longer true. The backtracking allocator (our original RA2 algorithm, and still the default today) will never insert code at the end of a block when it has multiple terminators, because it associates such block-start/end insertions with *edges*; so in such conditions it inserts instructions into the tops of successor blocks instead. But the single-pass allocator needs to perform work at the end of every block, so it will trigger this bug. This PR removes `JmpIf` and converts the br-of-fcmp lowering to use `JmpCondOr` instead, which is a pseudoinstruction that does `jcc1; jcc2; jmp`. This maintains the BB invariant and fixes the bug. Note that Winch still uses `JmpIf`, so we cannot remove it entirely: this PR renames it to `WinchJmpIf` instead, and adds a mechanism to assert failure if it is ever added to `VCode` (rather than emitted directly, as Winch's macro-assembler does). We could instead write Winch's `jmp_if` assembler function in terms of `JmpCond` with a fallthrough label that is immediately bound, and let the MachBuffer always chomp the jmp; I opted not to regress Winch compiler performance by doing this. If one day we abstract out the assembler further, we can remove `WinchJmpIf`. This is one of two instances of a "one-armed branch"; the other is s390x's `OneWayCondBr`, used in `br_table` lowerings, which we will address separately. Once we do, that will address #9980 entirely. --- cranelift/codegen/src/isa/x64/inst.isle | 49 +++++++----- cranelift/codegen/src/isa/x64/inst/emit.rs | 75 ++++++++++++++++++- cranelift/codegen/src/isa/x64/inst/mod.rs | 28 ++++++- cranelift/codegen/src/isa/x64/pcc.rs | 3 +- cranelift/codegen/src/machinst/mod.rs | 8 ++ cranelift/codegen/src/machinst/vcode.rs | 1 + .../filetests/filetests/isa/x64/branches.clif | 18 ++--- .../filetests/isa/x64/f64-branches-avx.clif | 6 +- ...arefully-sink-loads-in-float-compares.clif | 24 ++---- winch/codegen/src/isa/x64/asm.rs | 2 +- 10 files changed, 155 insertions(+), 59 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 393174e48e73..674298af891c 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -578,17 +578,16 @@ ;; Jump to a known target: jmp simm32. (JmpKnown (dst MachLabel)) - ;; One-way conditional branch: jcond cond target. + ;; Low-level one-way conditional branch: jcond cond target. ;; - ;; This instruction is useful when we have conditional jumps depending on - ;; more than two conditions, see for instance the lowering of Brif - ;; with Fcmp inputs. - ;; - ;; A note of caution: in contexts where the branch target is another - ;; block, this has to be the same successor as the one specified in the - ;; terminator branch of the current block. Otherwise, this might confuse - ;; register allocation by creating new invisible edges. - (JmpIf (cc CC) + ;; This instruction is useful only for lower-level code + ;; generators that use the Cranelift instruction backend as an + ;; assembler library. The instruction is thus named after its + ;; primary user, Winch. This instruction *should not* be used + ;; by Cranelift proper and placed into VCode: it does not + ;; adhere to the basic-block invariant, namely that branches + ;; always end a block (with no fallthrough). + (WinchJmpIf (cc CC) (taken MachLabel)) ;; Two-way conditional branch: jcond cond target target. @@ -599,6 +598,17 @@ (taken MachLabel) (not_taken MachLabel)) + ;; Two-way conditional branch with a combination of conditions: + ;; + ;; j(cc1 or cc2) target1 target2 + ;; + ;; Emitted as a compound sequence of three branches -- `jcc1 + ;; target1`, `jcc2 target1`, `jmp target2`. + (JmpCondOr (cc1 CC) + (cc2 CC) + (taken MachLabel) + (not_taken MachLabel)) + ;; Jump-table sequence, as one compound instruction (see note in lower.rs ;; for rationale). ;; @@ -5030,15 +5040,16 @@ (rule (jmp_known target) (SideEffectNoResult.Inst (MInst.JmpKnown target))) -(decl jmp_if (CC MachLabel) ConsumesFlags) -(rule (jmp_if cc taken) - (ConsumesFlags.ConsumesFlagsSideEffect (MInst.JmpIf cc taken))) - ;; Conditional jump based on the condition code. (decl jmp_cond (CC MachLabel MachLabel) ConsumesFlags) (rule (jmp_cond cc taken not_taken) (ConsumesFlags.ConsumesFlagsSideEffect (MInst.JmpCond cc taken not_taken))) +;; Conditional jump based on the OR of two condition codes. +(decl jmp_cond_or (CC CC MachLabel MachLabel) ConsumesFlags) +(rule (jmp_cond_or cc1 cc2 taken not_taken) + (ConsumesFlags.ConsumesFlagsSideEffect (MInst.JmpCondOr cc1 cc2 taken not_taken))) + ;; Conditional jump based on the result of an icmp. (decl jmp_cond_icmp (IcmpCondResult MachLabel MachLabel) SideEffectNoResult) (rule (jmp_cond_icmp (IcmpCondResult.Condition producer cc) taken not_taken) @@ -5050,14 +5061,12 @@ (with_flags_side_effect producer (jmp_cond cc taken not_taken))) (rule (jmp_cond_fcmp (FcmpCondResult.AndCondition producer cc1 cc2) taken not_taken) (with_flags_side_effect producer - (consumes_flags_concat - (jmp_if (cc_invert cc1) not_taken) - (jmp_cond (cc_invert cc2) not_taken taken)))) + ;; DeMorgan's rule: to get cc1 AND cc2, we do NOT (NOT cc1 OR NOT cc2). + ;; The outer NOT comes from flipping `not_taken` and `taken`. + (jmp_cond_or (cc_invert cc1) (cc_invert cc2) not_taken taken))) (rule (jmp_cond_fcmp (FcmpCondResult.OrCondition producer cc1 cc2) taken not_taken) (with_flags_side_effect producer - (consumes_flags_concat - (jmp_if cc1 taken) - (jmp_cond cc2 taken not_taken)))) + (jmp_cond_or cc1 cc2 taken not_taken))) ;; Emit the compound instruction that does: ;; diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 2a8ce8d7ff15..3ba5d3e7dd99 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1622,7 +1622,8 @@ pub(crate) fn emit( inst.emit(sink, info, state); // jne .loop_start - // TODO: Encoding the JmpIf as a short jump saves us 4 bytes here. + // TODO: Encoding the conditional jump as a short jump + // could save us us 4 bytes here. one_way_jmp(sink, CC::NZ, loop_start); // The regular prologue code is going to emit a `sub` after this, so we need to @@ -1891,7 +1892,7 @@ pub(crate) fn emit( sink.put4(0x0); } - Inst::JmpIf { cc, taken } => { + Inst::WinchJmpIf { cc, taken } => { let cond_start = sink.cur_offset(); let cond_disp_off = cond_start + 2; @@ -1936,6 +1937,76 @@ pub(crate) fn emit( sink.put4(0x0); } + Inst::JmpCondOr { + cc1, + cc2, + taken, + not_taken, + } => { + // Emit: + // jcc1 taken + // jcc2 taken + // jmp not_taken + // + // Note that we enroll both conditionals in the + // branch-chomping mechanism because MachBuffer + // simplification can continue upward as long as it keeps + // chomping branches. In the best case, if taken == + // not_taken and that one block is the fallthrough block, + // all three branches can disappear. + + // jcc1 taken + let cond_1_start = sink.cur_offset(); + let cond_1_disp_off = cond_1_start + 2; + let cond_1_end = cond_1_start + 6; + + sink.use_label_at_offset(cond_1_disp_off, *taken, LabelUse::JmpRel32); + let inverted: [u8; 6] = [ + 0x0F, + 0x80 + (cc1.invert().get_enc()), + 0x00, + 0x00, + 0x00, + 0x00, + ]; + sink.add_cond_branch(cond_1_start, cond_1_end, *taken, &inverted[..]); + + sink.put1(0x0F); + sink.put1(0x80 + cc1.get_enc()); + sink.put4(0x0); + + // jcc2 taken + let cond_2_start = sink.cur_offset(); + let cond_2_disp_off = cond_2_start + 2; + let cond_2_end = cond_2_start + 6; + + sink.use_label_at_offset(cond_2_disp_off, *taken, LabelUse::JmpRel32); + let inverted: [u8; 6] = [ + 0x0F, + 0x80 + (cc2.invert().get_enc()), + 0x00, + 0x00, + 0x00, + 0x00, + ]; + sink.add_cond_branch(cond_2_start, cond_2_end, *taken, &inverted[..]); + + sink.put1(0x0F); + sink.put1(0x80 + cc2.get_enc()); + sink.put4(0x0); + + // jmp not_taken + let uncond_start = sink.cur_offset(); + let uncond_disp_off = uncond_start + 1; + let uncond_end = uncond_start + 5; + + sink.use_label_at_offset(uncond_disp_off, *not_taken, LabelUse::JmpRel32); + sink.add_uncond_branch(uncond_start, uncond_end, *not_taken); + + sink.put1(0xE9); + sink.put4(0x0); + } + Inst::JmpUnknown { target } => { let target = target.clone(); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index ea69f97e7a60..2782a8632b6a 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -93,7 +93,8 @@ impl Inst { | Inst::Hlt | Inst::Imm { .. } | Inst::JmpCond { .. } - | Inst::JmpIf { .. } + | Inst::JmpCondOr { .. } + | Inst::WinchJmpIf { .. } | Inst::JmpKnown { .. } | Inst::JmpTableSeq { .. } | Inst::JmpUnknown { .. } @@ -1738,12 +1739,24 @@ impl PrettyPrint for Inst { format!("{op} {dst}") } - Inst::JmpIf { cc, taken } => { + Inst::WinchJmpIf { cc, taken } => { let taken = taken.to_string(); let op = ljustify2("j".to_string(), cc.to_string()); format!("{op} {taken}") } + Inst::JmpCondOr { + cc1, + cc2, + taken, + not_taken, + } => { + let taken = taken.to_string(); + let not_taken = not_taken.to_string(); + let op = ljustify(format!("j{},{}", cc1, cc2)); + format!("{op} {taken}; j {not_taken}") + } + Inst::JmpCond { cc, taken, @@ -2657,8 +2670,9 @@ fn x64_get_operands(inst: &mut Inst, collector: &mut impl OperandVisitor) { } Inst::JmpKnown { .. } - | Inst::JmpIf { .. } + | Inst::WinchJmpIf { .. } | Inst::JmpCond { .. } + | Inst::JmpCondOr { .. } | Inst::Ret { .. } | Inst::Nop { .. } | Inst::TrapIf { .. } @@ -2775,12 +2789,20 @@ impl MachInst for Inst { } &Self::JmpKnown { .. } => MachTerminator::Uncond, &Self::JmpCond { .. } => MachTerminator::Cond, + &Self::JmpCondOr { .. } => MachTerminator::Cond, &Self::JmpTableSeq { .. } => MachTerminator::Indirect, // All other cases are boring. _ => MachTerminator::None, } } + fn is_low_level_branch(&self) -> bool { + match self { + &Self::WinchJmpIf { .. } => true, + _ => false, + } + } + fn is_mem_access(&self) -> bool { panic!("TODO FILL ME OUT") } diff --git a/cranelift/codegen/src/isa/x64/pcc.rs b/cranelift/codegen/src/isa/x64/pcc.rs index b3595ca7a06d..6d5ddce3e2d8 100644 --- a/cranelift/codegen/src/isa/x64/pcc.rs +++ b/cranelift/codegen/src/isa/x64/pcc.rs @@ -840,8 +840,9 @@ pub(crate) fn check( | Inst::ReturnCallKnown { .. } | Inst::JmpKnown { .. } | Inst::Ret { .. } - | Inst::JmpIf { .. } + | Inst::WinchJmpIf { .. } | Inst::JmpCond { .. } + | Inst::JmpCondOr { .. } | Inst::TrapIf { .. } | Inst::TrapIfAnd { .. } | Inst::TrapIfOr { .. } diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index ed07b1bf0365..8ffc68b4e81f 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -201,6 +201,14 @@ pub trait MachInst: Clone + Debug { /// architecture. fn function_alignment() -> FunctionAlignment; + /// Is this a low-level, one-way branch, not meant for use in a + /// VCode body? These instructions are meant to be used only when + /// directly emitted, i.e. when `MachInst` is used as an assembler + /// library. + fn is_low_level_branch(&self) -> bool { + false + } + /// A label-use kind: a type that describes the types of label references that /// can occur in an instruction. type LabelUse: MachInstLabelUse; diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs index 8f33118f8e4b..c026d723e78e 100644 --- a/cranelift/codegen/src/machinst/vcode.rs +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -357,6 +357,7 @@ impl VCodeBuilder { /// Push an instruction for the current BB and current IR inst /// within the BB. pub fn push(&mut self, insn: I, loc: RelSourceLoc) { + assert!(!insn.is_low_level_branch()); // These are not meant to be in VCode. self.vcode.insts.push(insn); self.vcode.srclocs.push(loc); } diff --git a/cranelift/filetests/filetests/isa/x64/branches.clif b/cranelift/filetests/filetests/isa/x64/branches.clif index fbbd876b8e98..1e3b7279c9fd 100644 --- a/cranelift/filetests/filetests/isa/x64/branches.clif +++ b/cranelift/filetests/filetests/isa/x64/branches.clif @@ -167,8 +167,7 @@ block2: ; movq %rsp, %rbp ; block0: ; ucomiss %xmm1, %xmm0 -; jp label1 -; jnz label1; j label2 +; jp,nz label1; j label2 ; block1: ; movl $2, %eax ; movq %rbp, %rsp @@ -216,8 +215,7 @@ block2: ; movq %rsp, %rbp ; block0: ; ucomiss %xmm1, %xmm0 -; jp label1 -; jnz label1; j label2 +; jp,nz label1; j label2 ; block1: ; movl $1, %eax ; movq %rbp, %rsp @@ -265,8 +263,7 @@ block2: ; movq %rsp, %rbp ; block0: ; ucomiss %xmm1, %xmm0 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; movl $1, %eax ; movq %rbp, %rsp @@ -631,8 +628,7 @@ block202: ; movq %rsp, %rbp ; block0: ; ucomiss const(1), %xmm0 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; jmp label5 ; block2: @@ -753,8 +749,7 @@ block2: ; movq %rsp, %rbp ; block0: ; ucomiss %xmm1, %xmm0 -; jp label1 -; jnz label1; j label2 +; jp,nz label1; j label2 ; block1: ; movl $2, %eax ; movq %rbp, %rsp @@ -855,8 +850,7 @@ block2: ; movq %rsp, %rbp ; block0: ; ucomiss %xmm1, %xmm0 -; jp label1 -; jnz label1; j label2 +; jp,nz label1; j label2 ; block1: ; movl $2, %eax ; movq %rbp, %rsp diff --git a/cranelift/filetests/filetests/isa/x64/f64-branches-avx.clif b/cranelift/filetests/filetests/isa/x64/f64-branches-avx.clif index f40100bbfa97..48246f934f21 100644 --- a/cranelift/filetests/filetests/isa/x64/f64-branches-avx.clif +++ b/cranelift/filetests/filetests/isa/x64/f64-branches-avx.clif @@ -20,8 +20,7 @@ block2: ; movq %rsp, %rbp ; block0: ; vucomiss %xmm1, %xmm0 -; jp label1 -; jnz label1; j label2 +; jp,nz label1; j label2 ; block1: ; movl $2, %eax ; movq %rbp, %rsp @@ -71,8 +70,7 @@ block2: ; movq %rsp, %rbp ; block0: ; vucomisd %xmm1, %xmm0 -; jp label1 -; jnz label1; j label2 +; jp,nz label1; j label2 ; block1: ; movl $2, %eax ; movq %rbp, %rsp diff --git a/cranelift/filetests/filetests/isa/x64/very-carefully-sink-loads-in-float-compares.clif b/cranelift/filetests/filetests/isa/x64/very-carefully-sink-loads-in-float-compares.clif index 01c9d4db2344..2e0769262347 100644 --- a/cranelift/filetests/filetests/isa/x64/very-carefully-sink-loads-in-float-compares.clif +++ b/cranelift/filetests/filetests/isa/x64/very-carefully-sink-loads-in-float-compares.clif @@ -186,8 +186,7 @@ block1(v8: i32): ; block0: ; movss 0(%rdi), %xmm1 ; ucomiss %xmm1, %xmm0 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; movq %rsi, %rax ; jmp label3 @@ -233,8 +232,7 @@ block1(v8: i32): ; block0: ; movss 0(%rdi), %xmm1 ; ucomiss %xmm0, %xmm1 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; movq %rsi, %rax ; jmp label3 @@ -282,8 +280,7 @@ block2(v7: i32): ; block0: ; movss 0(%rdi), %xmm3 ; ucomiss %xmm3, %xmm0 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; movq %rsi, %rax ; jmp label3 @@ -292,8 +289,7 @@ block2(v7: i32): ; jmp label3 ; block3: ; ucomiss %xmm3, %xmm0 -; jp label5 -; jnz label5; j label4 +; jp,nz label5; j label4 ; block4: ; jmp label6 ; block5: @@ -346,8 +342,7 @@ block2(v7: i32): ; block0: ; movss 0(%rdi), %xmm3 ; ucomiss %xmm0, %xmm3 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; movq %rsi, %rax ; jmp label3 @@ -356,8 +351,7 @@ block2(v7: i32): ; jmp label3 ; block3: ; ucomiss %xmm0, %xmm3 -; jp label5 -; jnz label5; j label4 +; jp,nz label5; j label4 ; block4: ; jmp label6 ; block5: @@ -413,8 +407,7 @@ block1(v7: i32): ; cmovpl %edx, %eax, %eax ; cmovnzl %edx, %eax, %eax ; ucomiss %xmm2, %xmm0 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; jmp label3 ; block2: @@ -465,8 +458,7 @@ block1(v7: i32): ; cmovpl %edx, %eax, %eax ; cmovnzl %edx, %eax, %eax ; ucomiss %xmm0, %xmm2 -; jp label2 -; jnz label2; j label1 +; jp,nz label2; j label1 ; block1: ; jmp label3 ; block2: diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs index 89e21cd60b89..92a5958d23a8 100644 --- a/winch/codegen/src/isa/x64/asm.rs +++ b/winch/codegen/src/isa/x64/asm.rs @@ -1537,7 +1537,7 @@ impl Assembler { /// Emits a conditional jump to the given label. pub fn jmp_if(&mut self, cc: impl Into, taken: MachLabel) { - self.emit(Inst::JmpIf { + self.emit(Inst::WinchJmpIf { cc: cc.into(), taken, });