diff --git a/tests/c/nested_sidetrace.c b/tests/c/nested_sidetrace.c index 2aa078c7b..cdcc71bd4 100644 --- a/tests/c/nested_sidetrace.c +++ b/tests/c/nested_sidetrace.c @@ -32,16 +32,17 @@ // 18 // yk-jit-event: enter-jit-code // yk-jit-event: deoptimise -// yk-jit-event: start-side-tracing // 20 +// yk-jit-event: enter-jit-code +// yk-jit-event: deoptimise +// yk-jit-event: start-side-tracing +// 22 // yk-jit-event: stop-tracing // --- Begin jit-pre-opt --- // ... // --- End jit-pre-opt --- -// 22 -// yk-jit-event: enter-jit-code -// yk-jit-event: execute-side-trace // 24 +// yk-jit-event: enter-jit-code // yk-jit-event: execute-side-trace // 26 // yk-jit-event: execute-side-trace diff --git a/tests/c/side-trace.c b/tests/c/side-trace.c index 230f573ca..a5ab0d3d4 100644 --- a/tests/c/side-trace.c +++ b/tests/c/side-trace.c @@ -87,7 +87,7 @@ int foo(int i) { int main(int argc, char **argv) { YkMT *mt = yk_mt_new(NULL); yk_mt_hot_threshold_set(mt, 0); - yk_mt_sidetrace_threshold_set(mt, 5); + yk_mt_sidetrace_threshold_set(mt, 4); YkLocation loc = yk_location_new(); int res = 0; diff --git a/tests/c/simple_peeling.c b/tests/c/simple_peeling.c new file mode 100644 index 000000000..e962820c5 --- /dev/null +++ b/tests/c/simple_peeling.c @@ -0,0 +1,49 @@ +// Run-time: +// env-var: YKD_LOG_IR=jit-post-opt +// env-var: YKD_SERIALISE_COMPILATION=1 +// env-var: YK_LOG=4 +// stderr: +// ... +// --- Begin jit-post-opt --- +// ... +// header_start ... +// ... +// header_end [%{{0}}, %{{1}}, %{{2}}, %{{3}}, %{{4}}, %{{25}}] +// ... +// body_start [%{{30}}, %{{31}}, %{{32}}, %{{33}}, %{{34}}, %{{35}}] +// ... +// body_end ... +// ... +// --- End jit-post-opt --- +// ... + +// Check that basic trace compilation works. + +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) { + YkMT *mt = yk_mt_new(NULL); + yk_mt_hot_threshold_set(mt, 0); + YkLocation loc = yk_location_new(); + + int res = 9998; + int i = 4; + NOOPT_VAL(loc); + NOOPT_VAL(res); + NOOPT_VAL(i); + while (i > 0) { + yk_mt_control_point(mt, &loc); + fprintf(stderr, "%d\n", i); + i--; + } + fprintf(stderr, "exit\n"); + NOOPT_VAL(res); + yk_location_drop(loc); + yk_mt_shutdown(mt); + return (EXIT_SUCCESS); +} diff --git a/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs b/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs index 84dfc4128..42ba7bdc6 100644 --- a/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs +++ b/ykrt/src/compile/jitc_yk/codegen/x64/lsregalloc.rs @@ -175,6 +175,26 @@ impl<'a> LSRegAlloc<'a> { } } + /// Reset the register allocator. We use this when moving from the trace header into the trace + /// body. + pub(crate) fn reset(&mut self) { + for rs in self.gp_reg_states.iter_mut() { + *rs = RegState::Empty; + } + for reg in RESERVED_GP_REGS { + self.gp_reg_states[usize::from(reg.code())] = RegState::Reserved; + } + self.gp_regset = RegSet::with_gp_reserved(); + + for rs in self.fp_reg_states.iter_mut() { + *rs = RegState::Empty; + } + for reg in RESERVED_FP_REGS { + self.fp_reg_states[usize::from(reg.code())] = RegState::Reserved; + } + self.fp_regset = RegSet::with_fp_reserved(); + } + /// Before generating code for the instruction at `iidx`, see which registers are no longer /// needed and mark them as [RegState::Empty]. Calling this allows the register allocator to /// use the set of available registers more efficiently. 
@@ -379,7 +399,8 @@ impl LSRegAlloc<'_> { | RegConstraint::OutputCanBeSameAsInput(_) | RegConstraint::OutputFromReg(_) | RegConstraint::InputOutput(_) => true, - RegConstraint::Clobber(_) | RegConstraint::Temporary => false, + RegConstraint::Clobber(_) | RegConstraint::Temporary | RegConstraint::None => + false, }) .count() <= 1 @@ -409,7 +430,8 @@ impl LSRegAlloc<'_> { | RegConstraint::Output | RegConstraint::OutputCanBeSameAsInput(_) | RegConstraint::Input(_) - | RegConstraint::Temporary => {} + | RegConstraint::Temporary + | RegConstraint::None => {} } } @@ -506,6 +528,9 @@ impl LSRegAlloc<'_> { | RegConstraint::OutputCanBeSameAsInput(_) | RegConstraint::OutputFromReg(_) | RegConstraint::Temporary => (), + RegConstraint::None => { + asgn[i] = Some(GP_REGS[0]); + } } } @@ -595,7 +620,8 @@ impl LSRegAlloc<'_> { | RegConstraint::OutputCanBeSameAsInput(_) | RegConstraint::OutputFromReg(_) | RegConstraint::Clobber(_) - | RegConstraint::Temporary => (), + | RegConstraint::Temporary + | RegConstraint::None => (), } } @@ -641,6 +667,7 @@ impl LSRegAlloc<'_> { self.gp_regset.unset(reg); self.gp_reg_states[usize::from(reg.code())] = RegState::Empty; } + RegConstraint::None => (), } } asgn.map(|x| x.unwrap()) @@ -891,14 +918,17 @@ impl LSRegAlloc<'_> { match inst { Inst::Copy(_) => panic!(), Inst::Const(cidx) => match self.m.const_(cidx) { - Const::Float(_, _) => todo!(), + Const::Float(_, v) => VarLocation::ConstFloat(*v), Const::Int(tyidx, v) => { let Ty::Integer(bits) = self.m.type_(*tyidx) else { panic!() }; VarLocation::ConstInt { bits: *bits, v: *v } } - Const::Ptr(_) => todo!(), + Const::Ptr(p) => VarLocation::ConstInt { + bits: 64, + v: u64::try_from(*p).unwrap(), + }, }, _ => match self.spills[usize::from(iidx)] { SpillState::Empty => panic!(), @@ -946,7 +976,8 @@ impl LSRegAlloc<'_> { | RegConstraint::OutputCanBeSameAsInput(_) | RegConstraint::OutputFromReg(_) | RegConstraint::InputOutput(_) => true, - RegConstraint::Clobber(_) | RegConstraint::Temporary => false, + RegConstraint::Clobber(_) | RegConstraint::Temporary | RegConstraint::None => + false, }) .count() <= 1 @@ -977,6 +1008,9 @@ impl LSRegAlloc<'_> { | RegConstraint::OutputCanBeSameAsInput(_) | RegConstraint::Output | RegConstraint::Temporary => {} + RegConstraint::None => { + asgn[i] = Some(FP_REGS[0]); + } } } @@ -1023,6 +1057,7 @@ impl LSRegAlloc<'_> { | RegConstraint::OutputFromReg(_) | RegConstraint::Temporary => (), RegConstraint::OutputCanBeSameAsInput(_) => todo!(), + RegConstraint::None => (), } } @@ -1113,6 +1148,7 @@ impl LSRegAlloc<'_> { | RegConstraint::Clobber(_) | RegConstraint::Temporary => (), RegConstraint::OutputCanBeSameAsInput(_) => todo!(), + RegConstraint::None => (), } } @@ -1157,6 +1193,7 @@ impl LSRegAlloc<'_> { self.fp_reg_states[usize::from(reg.code())] = RegState::Empty; } RegConstraint::OutputCanBeSameAsInput(_) => todo!(), + RegConstraint::None => (), } } asgn.map(|x| x.unwrap()) @@ -1400,6 +1437,9 @@ pub(crate) enum RegConstraint { Clobber(R), /// A temporary register *x* that the instruction will clobber. Temporary, + /// A no-op register constraint. A random register will be assigned to this: using this + /// register for any purposes leads to undefined behaviour. 
+ None, } #[cfg(debug_assertions)] impl RegConstraint { @@ -1416,7 +1456,8 @@ impl RegConstraint { | Self::OutputCanBeSameAsInput(_) | Self::OutputFromReg(_) | Self::Clobber(_) - | Self::Temporary => None, + | Self::Temporary + | Self::None => None, } } } diff --git a/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs b/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs index 3f9dbbdac..c78281593 100644 --- a/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs +++ b/ykrt/src/compile/jitc_yk/codegen/x64/mod.rs @@ -174,8 +174,10 @@ impl X64CodeGen { struct Assemble<'a> { m: &'a jit_ir::Module, ra: LSRegAlloc<'a>, - /// The locations of the live variables at the beginning of the loop. - loop_start_locs: Vec<VarLocation>, + /// The locations of the live variables at the beginning of the trace header. + header_start_locs: Vec<VarLocation>, + /// The locations of the live variables at the beginning of the trace body. + body_start_locs: Vec<VarLocation>, asm: dynasmrt::x64::Assembler, /// Deopt info, with one entry per guard, in the order that the guards appear in the trace. deoptinfo: HashMap, @@ -199,6 +201,12 @@ struct Assemble<'a> { /// layer of dead-code elimination: it doesn't cause JIT IR instructions to be removed, but /// it will stop any code being (directly) generated for some of them. used_insts: Vob, + /// The offset after the trace's prologue. This is the re-entry point when returning from + /// side-traces. + prologue_offset: AssemblyOffset, + /// Whether or not to skip processing [Param]s. We enable this once we've finished processing + /// the header, as the [Param]s in the trace body are currently only placeholders. + skip_params: bool, } impl<'a> Assemble<'a> { @@ -252,13 +260,16 @@ impl<'a> Assemble<'a> { m, ra: LSRegAlloc::new(m, inst_vals_alive_until, vloc_hints, sp_offset), asm, - loop_start_locs: Vec::new(), + header_start_locs: Vec::new(), + body_start_locs: Vec::new(), deoptinfo: HashMap::new(), comments: Cell::new(IndexMap::new()), sp_offset, root_offset, used_insts, ptradds, + prologue_offset: AssemblyOffset(0), + skip_params: false, })) } @@ -287,10 +298,6 @@ impl<'a> Assemble<'a> { } let alloc_off = self.emit_prologue(); - // The instruction offset after we've emitted the prologue (i.e. updated the stack - // pointer). We will later adjust this offset to also include one iteration of the trace - // so we can jump directly to the peeled loop.
- let prologue_offset = self.asm.offset(); self.cg_insts()?; @@ -449,8 +456,8 @@ impl<'a> Assemble<'a> { deoptinfo: self.deoptinfo, prevguards, sp_offset: self.ra.stack_size(), - prologue_offset: prologue_offset.0, - entry_vars: self.loop_start_locs.clone(), + prologue_offset: self.prologue_offset.0, + entry_vars: self.header_start_locs.clone(), hl: Arc::downgrade(&hl), comments: self.comments.take(), #[cfg(any(debug_assertions, test))] @@ -507,9 +514,11 @@ impl<'a> Assemble<'a> { continue; } jit_ir::Inst::Guard(i) => self.cg_guard(iidx, i), - jit_ir::Inst::TraceLoopStart => self.cg_traceloopstart(), - jit_ir::Inst::TraceLoopJump => self.cg_traceloopjump(), - jit_ir::Inst::RootJump => self.cg_rootjump(self.m.root_jump_addr()), + jit_ir::Inst::TraceHeaderStart => self.cg_header_start(), + jit_ir::Inst::TraceHeaderEnd => self.cg_header_end(), + jit_ir::Inst::TraceBodyStart => self.cg_body_start(), + jit_ir::Inst::TraceBodyEnd => self.cg_body_end(iidx), + jit_ir::Inst::SidetraceEnd => self.cg_sidetrace_end(iidx, self.m.root_jump_addr()), jit_ir::Inst::SExt(i) => self.cg_sext(iidx, i), jit_ir::Inst::ZExt(i) => self.cg_zext(iidx, i), jit_ir::Inst::BitCast(i) => self.cg_bitcast(iidx, i), @@ -1063,6 +1072,9 @@ impl<'a> Assemble<'a> { /// Codegen a [jit_ir::ParamInst]. This only informs the register allocator about the /// locations of live variables without generating any actual machine code. fn cg_param(&mut self, iidx: jit_ir::InstIdx, inst: &jit_ir::ParamInst) { + if self.skip_params { + return; + } let m = VarLocation::from_yksmp_location(self.m, iidx, self.m.param(inst.paramidx())); debug_assert!(self.m.inst(iidx).def_byte_size(self.m) <= REG64_BYTESIZE); match m { @@ -1776,20 +1788,27 @@ impl<'a> Assemble<'a> { /// # Arguments /// /// * `tgt_vars` - The target locations. If `None` use `self.loop_start_locs` instead. - fn write_jump_vars(&mut self, tgt_vars: Option<&[VarLocation]>) { + fn write_jump_vars(&mut self, iidx: InstIdx, is_sidetrace: bool) { + let (tgt_vars, src_ops) = if is_sidetrace { + // Side-traces don't have a body and store these variables in `trace_header_end`. + (self.m.root_entry_vars(), self.m.trace_header_end()) + } else { + (self.body_start_locs.as_slice(), self.m.trace_body_end()) + }; // If we pass in `None` use `self.loop_start_locs` instead. We need to do this since we // can't pass in `&self.loop_start_locs` directly due to borrowing restrictions. - let tgt_vars = tgt_vars.unwrap_or(self.loop_start_locs.as_slice()); - for (i, op) in self.m.loop_jump_operands().iter().enumerate() { + let mut gp_regs = lsregalloc::GP_REGS + .iter() + .map(|_| RegConstraint::None) + .collect::>(); + let mut fp_regs = lsregalloc::FP_REGS + .iter() + .map(|_| RegConstraint::None) + .collect::>(); + for (i, op) in src_ops.iter().enumerate() { // FIXME: This is completely broken: see the FIXME later. 
let op = op.unpack(self.m); - let (iidx, src) = match op { - Operand::Var(iidx) => (iidx, self.op_to_var_location(op.clone())), - Operand::Const(_) => ( - InstIdx::try_from(0).unwrap(), - self.op_to_var_location(op.clone()), - ), - }; + let src = self.op_to_var_location(op.clone()); let dst = tgt_vars[i]; if dst == src { // The value is already in the correct place, so there's nothing we need to @@ -1806,6 +1825,9 @@ impl<'a> Assemble<'a> { 8 => dynasm!(self.asm; mov QWORD [rbp - i32::try_from(off_dst).unwrap()], Rq(reg.code()) ), + 4 => dynasm!(self.asm; + mov DWORD [rbp - i32::try_from(off_dst).unwrap()], Rd(reg.code()) + ), _ => todo!(), }, VarLocation::ConstInt { bits, v } => match bits { @@ -1826,7 +1848,13 @@ impl<'a> Assemble<'a> { mov QWORD [rbp - i32::try_from(off_dst).unwrap()], rax; pop rax ), - _ => todo!(), + 4 => dynasm!(self.asm; + push rax; + mov eax, DWORD [rbp - i32::try_from(off_src).unwrap()]; + mov DWORD [rbp - i32::try_from(off_dst).unwrap()], eax; + pop rax + ), + e => todo!("{:?}", e), }, e => todo!("{:?}", e), } @@ -1837,42 +1865,34 @@ impl<'a> Assemble<'a> { // somewhere else (register/normal stack) so dst and src no longer // match. But since the value can't change we can safely ignore this. } - VarLocation::Register(reg) => { - // FIXME: This is completely broken, only works by accident and, probably, - // doesn't even always work. Continually running the register allocator in this - // way means we can end up clobbering values and, because this is the last - // instruction in the trace, none of the values will be used after this, so - // they don't have to be spilled. - match reg { - reg_alloc::Register::GP(r) => { - let [_] = self.ra.assign_gp_regs( - &mut self.asm, - iidx, - [RegConstraint::InputIntoReg(op.clone(), r)], - ); - } - reg_alloc::Register::FP(r) => { - let [_] = self.ra.assign_fp_regs( - &mut self.asm, - iidx, - [RegConstraint::InputIntoReg(op.clone(), r)], - ); - } + VarLocation::Register(reg) => match reg { + reg_alloc::Register::GP(r) => { + gp_regs[usize::from(r.code())] = RegConstraint::InputIntoReg(op.clone(), r); } - } + reg_alloc::Register::FP(r) => { + fp_regs[usize::from(r.code())] = RegConstraint::InputIntoReg(op.clone(), r); + } + }, _ => todo!(), } } + + let _: [_; lsregalloc::GP_REGS.len()] = + self.ra + .assign_gp_regs(&mut self.asm, iidx, gp_regs.try_into().unwrap()); + let _: [_; lsregalloc::FP_REGS.len()] = + self.ra + .assign_fp_regs(&mut self.asm, iidx, fp_regs.try_into().unwrap()); } - fn cg_traceloopjump(&mut self) { + fn cg_body_end(&mut self, iidx: InstIdx) { // Loop the JITted code if the `tloop_start` label is present (not relevant for IR created // by a test or a side-trace). let label = StaticLabel::global("tloop_start"); match self.asm.labels().resolve_static(&label) { Ok(_) => { // Found the label, emit a jump to it. - self.write_jump_vars(None); + self.write_jump_vars(iidx, false); dynasm!(self.asm; jmp ->tloop_start); } Err(DynasmError::UnknownLabel(_)) => { @@ -1893,10 +1913,10 @@ impl<'a> Assemble<'a> { } } - fn cg_rootjump(&mut self, addr: *const libc::c_void) { + fn cg_sidetrace_end(&mut self, iidx: InstIdx, addr: *const libc::c_void) { // The end of a side-trace. Map live variables of this side-trace to the entry variables of // the root parent trace, then jump to it. 
- self.write_jump_vars(Some(self.m.root_entry_vars())); + self.write_jump_vars(iidx, true); self.ra.align_stack(SYSV_CALL_STACK_ALIGN); dynasm!(self.asm @@ -1909,19 +1929,81 @@ impl<'a> Assemble<'a> { ; jmp rdi); } - fn cg_traceloopstart(&mut self) { - debug_assert_eq!(self.loop_start_locs.len(), 0); - // Remember the locations of the live variables at the beginning of the trace. When we loop - // back around here we need to write the live variables back into these same locations. - for var in self.m.loop_start_vars() { - let loc = match var { - Operand::Var(iidx) => self.ra.var_location(*iidx), + fn cg_header_start(&mut self) { + debug_assert_eq!(self.header_start_locs.len(), 0); + // Remember the locations of the live variables at the beginning of the trace. When we + // re-enter the trace from a side-trace, we need to write the live variables back into + // these same locations. + for var in self.m.trace_header_start() { + let loc = match var.unpack(self.m) { + Operand::Var(iidx) => self.ra.var_location(iidx), _ => panic!(), }; - self.loop_start_locs.push(loc); + self.header_start_locs.push(loc); + } + dynasm!(self.asm; ->reentry:); + self.prologue_offset = self.asm.offset(); + } + + fn cg_header_end(&mut self) { + // FIXME: This is a bit of a roundabout way of doing things. Especially since it means + // that the [ParamInst]s in the trace body are just placeholders. While, since a recent + // change, the register allocator makes sure the values automatically end up in the + // [VarLocation]s expected by the loop start, this only works for registers right now. We + // can extend this to spill locations as well, but won't be able to do so for variables + // that have become constants during the trace header. So we will always have to either + // update the [ParamInst]s of the trace body, which isn't ideal since it requires the + // [Module] to be mutable, or do what we do below just for constants. + let mut varlocs = Vec::new(); + for var in self.m.trace_header_end().iter() { + let varloc = self.op_to_var_location(var.unpack(self.m)); + varlocs.push(varloc); + } + // Reset the register allocator before priming it with information about the trace body + // inputs. + self.ra.reset(); + for (i, op) in self.m.trace_body_start().iter().enumerate() { + // By definition these can only be variables. + let iidx = match op.unpack(self.m) { + Operand::Var(iidx) => iidx, + _ => panic!(), + }; + let varloc = varlocs[i]; + + // Write the varlocations from the header jump to the body start. + // FIXME: This is copied verbatim from `cg_param` and can be reused. + match varloc { + VarLocation::Register(reg_alloc::Register::GP(reg)) => { + self.ra.force_assign_inst_gp_reg(&mut self.asm, iidx, reg); + } + VarLocation::Register(reg_alloc::Register::FP(reg)) => { + self.ra.force_assign_inst_fp_reg(iidx, reg); + } + VarLocation::Direct { frame_off, size: _ } => { + self.ra.force_assign_inst_direct(iidx, frame_off); + } + VarLocation::Stack { frame_off, size: _ } => { + self.ra + .force_assign_inst_indirect(iidx, i32::try_from(frame_off).unwrap()); + } + VarLocation::ConstInt { bits, v } => { + self.ra.assign_const(iidx, bits, v); + } + e => panic!("{:?}", e), + } + } + self.skip_params = true; + } + + fn cg_body_start(&mut self) { + debug_assert_eq!(self.body_start_locs.len(), 0); + // Remember the locations of the live variables at the beginning of the trace loop. When we + // loop back around here we need to write the live variables back into these same + // locations.
+ for var in self.m.trace_body_start() { + let loc = self.op_to_var_location(var.unpack(self.m)); + self.body_start_locs.push(loc); } - // FIXME: peel the initial iteration of the loop to allow us to hoist loop invariants. - // When doing so, update the jump target inside side-traces. dynasm!(self.asm; ->tloop_start:); } @@ -2293,9 +2375,10 @@ impl CompiledTrace for X64CompiledTrace { .collect(); let callframes = deoptinfo.inlined_frames.clone(); - // Calculate the address inside the root trace we want side-traces to jump to. Currently - // this is directly after the prologue. Later we will change this to jump to after the - // preamble and before the peeled loop. + // Calculate the address inside the root trace we want side-traces to jump. Since the + // side-trace finishes at the control point we need to re-enter via the trace header and + // cannot jump back directly into the trace body. + // FIXME: Check if RPython has found a solution to this (if there is any). let root_addr = unsafe { root_ctr.entry().add(root_ctr.prologue_offset) }; // Pass along [GuardIdx]'s of previous guard failures and add this guard failure's @@ -3611,14 +3694,14 @@ mod tests { %0: i16 = param 0 %1: i32 = param 1 %2: i63 = param 2 - tloop_start [%0, %1, %2] + header_start [%0, %1, %2] %4: i1 = eq %0, %0 %5: i1 = eq %1, %1 %6: i1 = eq %2, %2 black_box %4 black_box %5 black_box %6 - tloop_jump [%0, %1, %2] + header_end [%0, %1, %2] ", " ... @@ -3904,7 +3987,7 @@ mod tests { codegen_and_test( " entry: - tloop_jump [] + body_end [] ", " ... @@ -3921,13 +4004,13 @@ mod tests { codegen_and_test( " entry: - tloop_start [] - tloop_jump [] + body_start [] + body_end [] ", " ... - ; tloop_start []: - ; tloop_jump []: + ; body_start [] + ; body_end [] jmp {{target}} ", false, @@ -3940,20 +4023,20 @@ mod tests { " entry: %0: i8 = param 0 - tloop_start [%0] + body_start [%0] %2: i8 = add %0, %0 black_box %2 - tloop_jump [%0] + body_end [%0] ", " ... ; %0: i8 = param ... ... - ; tloop_start [%0]: + ; body_start [%0] ; %2: i8 = add %0, %0 {{_}} {{off}}: ... ... - ; tloop_jump [%0]: + ; body_end [%0] ... {{_}} {{_}}: jmp 0x00000000{{off}} ", @@ -4446,16 +4529,16 @@ mod tests { " entry: %0: i8 = param 0 - tloop_start [%0] + body_start [%0] %2: i8 = 42i8 - tloop_jump [%2] + body_end [%2] ", " ... ; %0: i8 = param ... ... - ; tloop_start [%0]: - ; tloop_jump [42i8]: + ; body_start [%0] + ; body_end [42i8] mov r.64.x, 0x2a jmp ... 
", diff --git a/ykrt/src/compile/jitc_yk/codegen/x64/rev_analyse.rs b/ykrt/src/compile/jitc_yk/codegen/x64/rev_analyse.rs index 69fc7db44..760099a7e 100644 --- a/ykrt/src/compile/jitc_yk/codegen/x64/rev_analyse.rs +++ b/ykrt/src/compile/jitc_yk/codegen/x64/rev_analyse.rs @@ -54,8 +54,11 @@ impl<'a> RevAnalyse<'a> { continue; } } - Inst::TraceLoopJump => { - self.an_tloop_jump(); + Inst::TraceBodyEnd => { + self.an_body_end(); + } + Inst::SidetraceEnd => { + self.an_sidetrace_end(); } Inst::SExt(x) => self.an_sext(iidx, x), Inst::ZExt(x) => self.an_zext(iidx, x), @@ -176,7 +179,7 @@ impl<'a> RevAnalyse<'a> { false } - fn an_tloop_jump(&mut self) { + fn an_body_end(&mut self) { let mut param_vlocs = Vec::new(); for (iidx, inst) in self.m.iter_skipping_insts() { match inst { @@ -191,15 +194,28 @@ impl<'a> RevAnalyse<'a> { } } - debug_assert_eq!(param_vlocs.len(), self.m.loop_jump_operands().len()); + debug_assert_eq!(param_vlocs.len(), self.m.trace_body_end().len()); - for (param_vloc, jump_op) in param_vlocs.into_iter().zip(self.m.loop_jump_operands()) { + for (param_vloc, jump_op) in param_vlocs.into_iter().zip(self.m.trace_body_end()) { if let Operand::Var(op_iidx) = jump_op.unpack(self.m) { self.vloc_hints[usize::from(op_iidx)] = Some(param_vloc); } } } + fn an_sidetrace_end(&mut self) { + let vlocs = self.m.root_entry_vars(); + // Side-traces don't have a trace body since we don't apply loop peeling and thus use + // `trace_header_end` to store the jump variables. + debug_assert_eq!(vlocs.len(), self.m.trace_header_end().len()); + + for (vloc, jump_op) in vlocs.iter().zip(self.m.trace_header_end()) { + if let Operand::Var(op_iidx) = jump_op.unpack(self.m) { + self.vloc_hints[usize::from(op_iidx)] = Some(*vloc); + } + } + } + fn an_sext(&mut self, iidx: InstIdx, seinst: SExtInst) { if let Operand::Var(op_iidx) = seinst.val(self.m) { self.vloc_hints[usize::from(op_iidx)] = self.vloc_hints[usize::from(iidx)]; @@ -272,9 +288,9 @@ mod test { " entry: %0: i8 = param 0 - tloop_start [%0] + body_start [%0] %2: i8 = %0 - tloop_jump [%2] + body_end [%2] ", ); let alives = rev_analyse(&m).unwrap().0; @@ -290,11 +306,11 @@ mod test { " entry: %0: i8 = param 0 - tloop_start [%0] + body_start [%0] %2: i8 = add %0, %0 %3: i8 = add %0, %0 %4: i8 = %2 - tloop_jump [%4] + body_end [%4] ", ); let alives = rev_analyse(&m).unwrap().0; diff --git a/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.l b/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.l index bfc3864e7..d91a8ed53 100644 --- a/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.l +++ b/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.l @@ -50,8 +50,10 @@ slt "SLT" srem "SREM" store "STORE" sub "SUB" -tloop_start "TLOOP_START" -tloop_jump "TLOOP_JUMP" +body_start "BODY_START" +body_end "BODY_END" +header_start "HEADER_START" +header_end "HEADER_END" true "TRUE" trunc "TRUNC" udiv "UDIV" diff --git a/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.y b/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.y index c42f72d32..07548a9cb 100644 --- a/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.y +++ b/ykrt/src/compile/jitc_yk/jit_ir/jit_ir.y @@ -146,8 +146,10 @@ Inst -> Result>: | "LOCAL_OPERAND" ":" Type "=" Operand { Ok(ASTInst::Assign{assign: $1?.span(), val: $5? 
}) } - | "TLOOP_START" "[" OperandsList "]" { Ok(ASTInst::TraceLoopStart($3?)) } - | "TLOOP_JUMP" "[" OperandsList "]" { Ok(ASTInst::TraceLoopJump($3?)) } + | "HEADER_START" "[" OperandsList "]" { Ok(ASTInst::TraceHeaderStart($3?)) } + | "HEADER_END" "[" OperandsList "]" { Ok(ASTInst::TraceHeaderEnd($3?)) } + | "BODY_START" "[" OperandsList "]" { Ok(ASTInst::TraceBodyStart($3?)) } + | "BODY_END" "[" OperandsList "]" { Ok(ASTInst::TraceBodyEnd($3?)) } ; Operand -> Result>: diff --git a/ykrt/src/compile/jitc_yk/jit_ir/mod.rs b/ykrt/src/compile/jitc_yk/jit_ir/mod.rs index 43db8b6ce..e7ae6c4bf 100644 --- a/ykrt/src/compile/jitc_yk/jit_ir/mod.rs +++ b/ykrt/src/compile/jitc_yk/jit_ir/mod.rs @@ -81,6 +81,14 @@ //! will be in. Canonicalisation is a weak promise, not a guarantee: later stages still have to //! deal with the other cases, but since they're mostly expected not to occur, they may be handled //! suboptimally if that makes the code easier. +//! +//! ## Side traces +//! +//! While we apply loop peeling to normal traces, this doesn't make sense for side-traces, which +//! thus don't have [Inst::TraceHeaderStart], [Inst::TraceHeaderEnd], [Inst::TraceBodyStart], and +//! [Inst::TraceBodyEnd] instructions. Instead, side-traces have a single [Inst::SidetraceEnd] +//! instruction at the end of the trace. The operands for this instruction are stored in +//! [Module::trace_header_end]. mod dead_code; #[cfg(test)] @@ -160,11 +168,15 @@ pub(crate) struct Module { indirect_calls: Vec<IndirectCallInst>, /// Live variables at the beginning of the root trace. root_entry_vars: Vec<VarLocation>, - /// Live variables at the beginning of the loop. - loop_start_vars: Vec<Operand>, - /// The ordered sequence of operands at the end of the loop: there will be one per [Operand] at - /// the start of the loop. - loop_jump_operands: Vec<PackedOperand>, + /// Live variables at the beginning of the trace body. + trace_body_start: Vec<PackedOperand>, + /// The ordered sequence of operands at the end of the trace body: there will be one per + /// [Operand] at the start of the body. + trace_body_end: Vec<PackedOperand>, + /// Live variables at the beginning of the trace header. + trace_header_start: Vec<PackedOperand>, + /// Live variables at the end of the trace header. + trace_header_end: Vec<PackedOperand>, /// The virtual address of the global variable pointer array.
/// /// This is an array (added to the LLVM AOT module and AOT codegenned by ykllvm) containing a @@ -256,9 +268,11 @@ impl Module { global_decls: IndexSet::new(), guard_info: Vec::new(), indirect_calls: Vec::new(), - loop_start_vars: Vec::new(), - loop_jump_operands: Vec::new(), root_entry_vars: Vec::new(), + trace_body_start: Vec::new(), + trace_body_end: Vec::new(), + trace_header_start: Vec::new(), + trace_header_end: Vec::new(), #[cfg(not(test))] globalvar_ptrs, }) @@ -576,12 +590,20 @@ impl Module { GuardInfoIdx::try_from(self.guard_info.len()).inspect(|_| self.guard_info.push(info)) } - pub(crate) fn loop_start_vars(&self) -> &[Operand] { - &self.loop_start_vars + pub(crate) fn trace_header_end(&self) -> &[PackedOperand] { + &self.trace_header_end + } + + pub(crate) fn trace_header_start(&self) -> &[PackedOperand] { + &self.trace_header_start + } + + pub(crate) fn trace_body_start(&self) -> &[PackedOperand] { + &self.trace_body_start } - pub(crate) fn push_loop_start_var(&mut self, op: Operand) { - self.loop_start_vars.push(op); + pub(crate) fn push_body_start_var(&mut self, op: Operand) { + self.trace_header_start.push(PackedOperand::new(&op)); } /// Store the entry live variables of the root traces so we can copy this side-trace's live @@ -591,8 +613,8 @@ impl Module { } /// Return the loop jump operands. - pub(crate) fn loop_jump_operands(&self) -> &[PackedOperand] { - &self.loop_jump_operands + pub(crate) fn trace_body_end(&self) -> &[PackedOperand] { + &self.trace_body_end } /// Get the entry live variables of the root trace. @@ -600,8 +622,8 @@ impl Module { &self.root_entry_vars } - pub(crate) fn push_loop_jump_var(&mut self, op: Operand) { - self.loop_jump_operands.push(PackedOperand::new(&op)); + pub(crate) fn push_header_end_var(&mut self, op: Operand) { + self.trace_header_end.push(PackedOperand::new(&op)); } /// Get the address of the root trace. This is where we need jump to at the end of a @@ -840,7 +862,7 @@ index_16bit!(ArgsIdx); /// A constant index. /// /// One of these is an index into the [Module::consts]. -#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd)] pub(crate) struct ConstIdx(u16); index_16bit!(ConstIdx); @@ -1104,7 +1126,7 @@ impl PackedOperand { } /// An operand. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, Eq, PartialEq)] pub(crate) enum Operand { /// This operand references another SSA variable. Var(InstIdx), @@ -1377,10 +1399,13 @@ pub(crate) enum Inst { Store(StoreInst), ICmp(ICmpInst), Guard(GuardInst), + /// Marks the point where side-traces reenter the root trace. + TraceHeaderStart, + TraceHeaderEnd, /// Marks the place to loop back to at the end of the JITted code. - TraceLoopStart, - TraceLoopJump, - RootJump, + TraceBodyStart, + TraceBodyEnd, + SidetraceEnd, SExt(SExtInst), ZExt(ZExtInst), @@ -1438,9 +1463,11 @@ impl Inst { Self::Store(..) => m.void_tyidx(), Self::ICmp(_) => m.int1_tyidx(), Self::Guard(..) 
=> m.void_tyidx(), - Self::TraceLoopStart => m.void_tyidx(), - Self::TraceLoopJump => m.void_tyidx(), - Self::RootJump => m.void_tyidx(), + Self::TraceHeaderStart => m.void_tyidx(), + Self::TraceHeaderEnd => m.void_tyidx(), + Self::TraceBodyStart => m.void_tyidx(), + Self::TraceBodyEnd => m.void_tyidx(), + Self::SidetraceEnd => m.void_tyidx(), Self::SExt(si) => si.dest_tyidx(), Self::ZExt(si) => si.dest_tyidx(), Self::BitCast(i) => i.dest_tyidx(), @@ -1484,7 +1511,11 @@ impl Inst { Inst::Copy(x) => m.inst_raw(*x).is_barrier(m), Inst::Guard(_) => true, Inst::Call(_) | Inst::IndirectCall(_) => true, - Inst::TraceLoopStart | Inst::TraceLoopJump | Inst::RootJump => true, + Inst::TraceHeaderStart + | Inst::TraceHeaderEnd + | Inst::TraceBodyStart + | Inst::TraceBodyEnd + | Inst::SidetraceEnd => true, _ => false, } } @@ -1548,18 +1579,28 @@ impl Inst { pop.unpack(m).map_iidx(f); } } - Inst::TraceLoopStart => { - for x in &m.loop_start_vars { - x.map_iidx(f); + Inst::TraceHeaderStart => { + for x in &m.trace_header_start { + x.unpack(m).map_iidx(f); } } - Inst::TraceLoopJump => { - for x in &m.loop_jump_operands { + Inst::TraceHeaderEnd => { + for x in &m.trace_header_end { x.unpack(m).map_iidx(f); } } - Inst::RootJump => { - for x in &m.loop_jump_operands { + Inst::TraceBodyStart => { + for x in &m.trace_body_start { + x.unpack(m).map_iidx(f); + } + } + Inst::TraceBodyEnd => { + for x in &m.trace_body_end { + x.unpack(m).map_iidx(f); + } + } + Inst::SidetraceEnd => { + for x in &m.trace_header_end { x.unpack(m).map_iidx(f); } } @@ -1587,6 +1628,193 @@ impl Inst { } } + /// Duplicate this [Inst] while applying function `f` to each operand. + pub(crate) fn dup_and_remap_locals( + &self, + m: &mut Module, + f: &F, + ) -> Result + where + F: Fn(InstIdx) -> Operand, + { + let mapper = |m: &Module, x: &PackedOperand| match x.unpack(m) { + Operand::Var(iidx) => PackedOperand::new(&f(iidx)), + Operand::Const(_) => *x, + }; + let op_mapper = |x: &Operand| match x { + Operand::Var(iidx) => f(*iidx), + Operand::Const(c) => Operand::Const(*c), + }; + let inst = match self { + #[cfg(test)] + Inst::BlackBox(BlackBoxInst { op }) => { + Inst::BlackBox(BlackBoxInst { op: mapper(m, op) }) + } + Inst::BinOp(BinOpInst { lhs, binop, rhs }) => Inst::BinOp(BinOpInst { + lhs: mapper(m, lhs), + binop: *binop, + rhs: mapper(m, rhs), + }), + Inst::Call(dc) => { + // Clone and map arguments. + let args = dc + .iter_args_idx() + .map(|x| op_mapper(&m.arg(x))) + .collect::>(); + let dc = DirectCallInst::new(m, dc.target, args)?; + Inst::Call(dc) + } + Inst::IndirectCall(iidx) => { + let ic = m.indirect_call(*iidx); + // Clone and map arguments. 
+ let args = ic + .iter_args_idx() + .map(|x| op_mapper(&m.arg(x))) + .collect::>(); + let icnew = IndirectCallInst::new(m, ic.ftyidx, op_mapper(&ic.target(m)), args)?; + let idx = m.push_indirect_call(icnew)?; + Inst::IndirectCall(idx) + } + Inst::Const(c) => Inst::Const(*c), + Inst::Copy(iidx) => match f(*iidx) { + Operand::Var(iidx) => Inst::Copy(iidx), + Operand::Const(cidx) => Inst::Const(cidx), + }, + Inst::DynPtrAdd(inst) => { + let ptr = inst.ptr; + let num_elems = inst.num_elems; + Inst::DynPtrAdd(DynPtrAddInst { + ptr: mapper(m, &ptr), + num_elems: mapper(m, &num_elems), + elem_size: inst.elem_size, + }) + } + Inst::FPToSI(FPToSIInst { val, dest_tyidx }) => Inst::FPToSI(FPToSIInst { + val: mapper(m, val), + dest_tyidx: *dest_tyidx, + }), + Inst::FPExt(FPExtInst { val, dest_tyidx }) => Inst::FPExt(FPExtInst { + val: mapper(m, val), + dest_tyidx: *dest_tyidx, + }), + Inst::FCmp(FCmpInst { lhs, pred, rhs }) => Inst::FCmp(FCmpInst { + lhs: mapper(m, lhs), + pred: *pred, + rhs: mapper(m, rhs), + }), + Inst::Guard(GuardInst { cond, expect, gidx }) => { + let ginfo = &m.guard_info[usize::from(*gidx)]; + let newlives = ginfo + .live_vars() + .iter() + .map(|(aot, jit)| (aot.clone(), mapper(m, jit))) + .collect(); + let inlined_frames = ginfo + .inlined_frames() + .iter() + .map(|x| { + InlinedFrame::new( + x.callinst.clone(), + x.funcidx, + x.safepoint, + x.args + .iter() + .map(|x| op_mapper(&x.unpack(m))) + .collect::>(), + ) + }) + .collect::>(); + let newginfo = GuardInfo::new( + ginfo.bid().clone(), + newlives, + inlined_frames, + ginfo.safepoint_id, + ); + let newgidx = m.push_guardinfo(newginfo).unwrap(); + Inst::Guard(GuardInst { + cond: mapper(m, cond), + expect: *expect, + gidx: newgidx, + }) + } + Inst::ICmp(ICmpInst { lhs, pred, rhs }) => Inst::ICmp(ICmpInst { + lhs: mapper(m, lhs), + pred: *pred, + rhs: mapper(m, rhs), + }), + Inst::Load(LoadInst { + op, + tyidx, + volatile, + }) => Inst::Load(LoadInst { + op: mapper(m, op), + tyidx: *tyidx, + volatile: *volatile, + }), + Inst::Param(x) => Inst::Param(*x), + Inst::LookupGlobal(g) => Inst::LookupGlobal(*g), + Inst::PtrAdd(inst) => { + let ptr = inst.ptr; + Inst::PtrAdd(PtrAddInst { + ptr: mapper(m, &ptr), + off: inst.off, + }) + } + Inst::SidetraceEnd => { + // This instruction only exists in side-traces, which don't have loops we can peel + // off. + unreachable!() + } + Inst::Select(SelectInst { + cond, + trueval, + falseval, + }) => Inst::Select(SelectInst { + cond: mapper(m, cond), + trueval: mapper(m, trueval), + falseval: mapper(m, falseval), + }), + Inst::SExt(SExtInst { val, dest_tyidx }) => Inst::SExt(SExtInst { + val: mapper(m, val), + dest_tyidx: *dest_tyidx, + }), + Inst::SIToFP(SIToFPInst { val, dest_tyidx }) => Inst::SIToFP(SIToFPInst { + val: mapper(m, val), + dest_tyidx: *dest_tyidx, + }), + Inst::Store(StoreInst { tgt, val, volatile }) => Inst::Store(StoreInst { + tgt: mapper(m, tgt), + val: mapper(m, val), + volatile: *volatile, + }), + Inst::Tombstone => Inst::Tombstone, + Inst::TraceHeaderStart => { + // Copy the header label into the body while remapping the operands. + m.trace_body_start = m + .trace_header_start + .iter() + .map(|op| mapper(m, op)) + .collect(); + Inst::TraceBodyStart + } + Inst::TraceHeaderEnd => { + // Copy the header label into the body while remapping the operands. 
+ m.trace_body_end = m.trace_header_end.iter().map(|op| mapper(m, op)).collect(); + Inst::TraceBodyEnd + } + Inst::Trunc(TruncInst { val, dest_tyidx }) => Inst::Trunc(TruncInst { + val: mapper(m, val), + dest_tyidx: *dest_tyidx, + }), + Inst::ZExt(ZExtInst { val, dest_tyidx }) => Inst::ZExt(ZExtInst { + val: mapper(m, val), + dest_tyidx: *dest_tyidx, + }), + e => todo!("{:?}", e), + }; + Ok(inst) + } + /// Returns the size of the local variable that this instruction defines (if any). /// /// # Panics @@ -1766,37 +1994,59 @@ impl fmt::Display for DisplayableInst<'_> { Inst::Param(x) => { write!(f, "param {:?}", self.m.params[usize::from(x.paramidx())]) } - Inst::TraceLoopStart => { + Inst::TraceHeaderStart => { + // Just marks a location, so we format it to look like a label. + write!(f, "header_start [")?; + for var in &self.m.trace_header_start { + write!(f, "{}", var.unpack(self.m).display(self.m))?; + if var != self.m.trace_header_start.last().unwrap() { + write!(f, ", ")?; + } + } + write!(f, "]") + } + Inst::TraceHeaderEnd => { // Just marks a location, so we format it to look like a label. - write!(f, "tloop_start [")?; - for var in &self.m.loop_start_vars { - write!(f, "{}", var.display(self.m))?; - if var != self.m.loop_start_vars.last().unwrap() { + write!(f, "header_end [")?; + for var in &self.m.trace_header_end { + write!(f, "{}", var.unpack(self.m).display(self.m))?; + if var != self.m.trace_header_end.last().unwrap() { + write!(f, ", ")?; + } + } + write!(f, "]") + } + Inst::TraceBodyStart => { + // Just marks a location, so we format it to look like a label. + write!(f, "body_start [")?; + for (i, var) in self.m.trace_body_start.iter().enumerate() { + write!(f, "{}", var.unpack(self.m).display(self.m))?; + if i + 1 < self.m.trace_body_start.len() { write!(f, ", ")?; } } - write!(f, "]:") + write!(f, "]") } - Inst::TraceLoopJump => { + Inst::TraceBodyEnd => { // Just marks a location, so we format it to look like a label. 
- write!(f, "tloop_jump [")?; - for var in &self.m.loop_jump_operands { + write!(f, "body_end [")?; + for (i, var) in self.m.trace_body_end.iter().enumerate() { write!(f, "{}", var.unpack(self.m).display(self.m))?; - if var != self.m.loop_jump_operands.last().unwrap() { + if i + 1 < self.m.trace_body_end.len() { write!(f, ", ")?; } } - write!(f, "]:") + write!(f, "]") } - Inst::RootJump => { - write!(f, "parent_jump {:?} [", self.m.root_jump_ptr)?; - for var in &self.m.loop_jump_operands { + Inst::SidetraceEnd => { + write!(f, "sidetrace_end {:?} [", self.m.root_jump_ptr)?; + for (i, var) in self.m.trace_header_end.iter().enumerate() { write!(f, "{}", var.unpack(self.m).display(self.m))?; - if var != self.m.loop_jump_operands.last().unwrap() { + if i + 1 < self.m.trace_header_end.len() { write!(f, ", ")?; } } - write!(f, "]:") + write!(f, "]") } Inst::SExt(i) => { write!(f, "sext {}", i.val(self.m).display(self.m),) diff --git a/ykrt/src/compile/jitc_yk/jit_ir/parser.rs b/ykrt/src/compile/jitc_yk/jit_ir/parser.rs index a6c151a44..ae78fec27 100644 --- a/ykrt/src/compile/jitc_yk/jit_ir/parser.rs +++ b/ykrt/src/compile/jitc_yk/jit_ir/parser.rs @@ -438,19 +438,33 @@ impl<'lexer, 'input: 'lexer> JITIRParser<'lexer, 'input, '_> { let inst = BlackBoxInst::new(self.process_operand(op)?); self.m.push(inst.into()).unwrap(); } - ASTInst::TraceLoopStart(ops) => { + ASTInst::TraceBodyStart(ops) => { for op in ops { let op = self.process_operand(op)?; - self.m.loop_start_vars.push(op); + self.m.trace_body_start.push(PackedOperand::new(&op)); } - self.m.push(Inst::TraceLoopStart).unwrap(); + self.m.push(Inst::TraceBodyStart).unwrap(); } - ASTInst::TraceLoopJump(ops) => { + ASTInst::TraceBodyEnd(ops) => { for op in ops { let op = self.process_operand(op)?; - self.m.loop_jump_operands.push(PackedOperand::new(&op)); + self.m.trace_body_end.push(PackedOperand::new(&op)); } - self.m.push(Inst::TraceLoopJump).unwrap(); + self.m.push(Inst::TraceBodyEnd).unwrap(); + } + ASTInst::TraceHeaderStart(ops) => { + for op in ops { + let op = self.process_operand(op)?; + self.m.trace_header_start.push(PackedOperand::new(&op)); + } + self.m.push(Inst::TraceHeaderStart).unwrap(); + } + ASTInst::TraceHeaderEnd(ops) => { + for op in ops { + let op = self.process_operand(op)?; + self.m.trace_header_end.push(PackedOperand::new(&op)); + } + self.m.push(Inst::TraceHeaderEnd).unwrap(); } ASTInst::Trunc { assign, @@ -835,8 +849,10 @@ enum ASTInst { volatile: bool, }, BlackBox(ASTOperand), - TraceLoopStart(Vec), - TraceLoopJump(Vec), + TraceBodyStart(Vec), + TraceBodyEnd(Vec), + TraceHeaderStart(Vec), + TraceHeaderEnd(Vec), Trunc { assign: Span, type_: ASTType, @@ -947,7 +963,7 @@ mod tests { %10: i32 = trunc %8 %11: i32 = add %7, %9 %12: i1 = eq %0, %2 - tloop_start [%0, %6] + body_start [%0, %6] guard true, %12, [%0, %1, %2, 1i8] call @f1() %16: i32 = call @f2(%5) @@ -998,7 +1014,7 @@ mod tests { %61: i64 = icall %9(%5, %7, %0) %62: float = bitcast %7 %63: float = fneg %54 - tloop_jump [%43, %58] + body_end [%43, %58] ", ); } diff --git a/ykrt/src/compile/jitc_yk/jit_ir/well_formed.rs b/ykrt/src/compile/jitc_yk/jit_ir/well_formed.rs index d84a458a9..301887385 100644 --- a/ykrt/src/compile/jitc_yk/jit_ir/well_formed.rs +++ b/ykrt/src/compile/jitc_yk/jit_ir/well_formed.rs @@ -29,10 +29,10 @@ use super::{BinOp, BinOpInst, Const, GuardInst, Inst, Module, Operand, Ty}; impl Module { pub(crate) fn assert_well_formed(&self) { if !self.root_entry_vars.is_empty() { - if self.root_entry_vars.len() != self.loop_jump_operands.len() { + if 
self.root_entry_vars.len() != self.trace_header_end.len() { panic!("Loop start/end variables have different lengths."); } - } else if self.loop_start_vars.len() != self.loop_jump_operands.len() { + } else if self.trace_header_start.len() != self.trace_header_end.len() { panic!("Loop start/end variables have different lengths."); } @@ -279,8 +279,8 @@ impl Module { } Inst::Param(_) => { if let Some(i) = last_inst { - if !matches!(i, Inst::Param(_)) { - panic!("Param instruction may only appear at the beginning of a trace or after another Param instruction\n {}", + if !matches!(i, Inst::Param(_) | Inst::TraceHeaderEnd) { + panic!("Param instruction may only appear at the beginning of a trace or after another Param instruction, or after the trace header jump\n {}", self.inst(iidx).display(iidx, self)); } } diff --git a/ykrt/src/compile/jitc_yk/opt/analyse.rs b/ykrt/src/compile/jitc_yk/opt/analyse.rs index 3000e370a..05c453afb 100644 --- a/ykrt/src/compile/jitc_yk/opt/analyse.rs +++ b/ykrt/src/compile/jitc_yk/opt/analyse.rs @@ -18,7 +18,11 @@ pub(super) struct Analyse { impl Analyse { pub(super) fn new(m: &Module) -> Analyse { Analyse { - values: vec![Value::Unknown; m.insts_len()], + // When we want to do loop peeling, we don't know actual size of the module at this + // point. What we do know is that it is at most two times the size (though since we + // don't copy over [Tombstone]s and [Copy]s it will be slightly less than that. + // FIXME: Can we calculate this more accurately? + values: vec![Value::Unknown; m.insts_len() * 2], } } diff --git a/ykrt/src/compile/jitc_yk/opt/mod.rs b/ykrt/src/compile/jitc_yk/opt/mod.rs index a53de69fc..a33526467 100644 --- a/ykrt/src/compile/jitc_yk/opt/mod.rs +++ b/ykrt/src/compile/jitc_yk/opt/mod.rs @@ -35,343 +35,403 @@ impl Opt { Self { m, an, instll } } - fn opt(mut self) -> Result { - let skipping = self.m.iter_skipping_insts().collect::>(); - for (iidx, inst) in skipping.into_iter() { - match inst { - #[cfg(test)] - Inst::BlackBox(_) => (), - Inst::Const(_) | Inst::Copy(_) | Inst::Tombstone => unreachable!(), - Inst::BinOp(x) => match x.binop() { - BinOp::Add => match ( - self.an.op_map(&self.m, x.lhs(&self.m)), - self.an.op_map(&self.m, x.rhs(&self.m)), - ) { - (Operand::Const(op_cidx), Operand::Var(op_iidx)) - | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { - match self.m.const_(op_cidx) { - Const::Int(_, 0) => { - // Replace `x + 0` with `x`. - self.m.replace(iidx, Inst::Copy(op_iidx)); - } - _ => { - // Canonicalise to (Var, Const). - self.m.replace( - iidx, - BinOpInst::new( - Operand::Var(op_iidx), - BinOp::Add, - Operand::Const(op_cidx), - ) - .into(), - ); - } + /// Optimise instruction `iidx`. + fn opt_inst(&mut self, iidx: InstIdx) -> Result<(), CompilationError> { + match self.m.inst(iidx) { + #[cfg(test)] + Inst::BlackBox(_) => (), + Inst::Const(_) | Inst::Copy(_) | Inst::Tombstone => unreachable!(), + Inst::BinOp(x) => match x.binop() { + BinOp::Add => match ( + self.an.op_map(&self.m, x.lhs(&self.m)), + self.an.op_map(&self.m, x.rhs(&self.m)), + ) { + (Operand::Const(op_cidx), Operand::Var(op_iidx)) + | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { + match self.m.const_(op_cidx) { + Const::Int(_, 0) => { + // Replace `x + 0` with `x`. + self.m.replace(iidx, Inst::Copy(op_iidx)); + } + _ => { + // Canonicalise to (Var, Const). 
+ self.m.replace( + iidx, + BinOpInst::new( + Operand::Var(op_iidx), + BinOp::Add, + Operand::Const(op_cidx), + ) + .into(), + ); } } - (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { - match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { - (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { - debug_assert_eq!(lhs_tyidx, rhs_tyidx); - let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { - panic!() - }; - let cidx = self.m.insert_const_int( - *lhs_tyidx, - (lhs_v.wrapping_add(*rhs_v)).truncate(*bits), - )?; - self.m.replace(iidx, Inst::Const(cidx)); - } - _ => todo!(), + } + (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { + match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { + (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { + debug_assert_eq!(lhs_tyidx, rhs_tyidx); + let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { + panic!() + }; + let cidx = self.m.insert_const_int( + *lhs_tyidx, + (lhs_v.wrapping_add(*rhs_v)).truncate(*bits), + )?; + self.m.replace(iidx, Inst::Const(cidx)); } + _ => todo!(), } - (Operand::Var(_), Operand::Var(_)) => (), - }, - BinOp::And => match ( - self.an.op_map(&self.m, x.lhs(&self.m)), - self.an.op_map(&self.m, x.rhs(&self.m)), - ) { - (Operand::Const(op_cidx), Operand::Var(op_iidx)) - | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { - match self.m.const_(op_cidx) { - Const::Int(_, 0) => { - // Replace `x & 0` with `0`. - self.m.replace(iidx, Inst::Const(op_cidx)); - } - _ => { - // Canonicalise to (Var, Const). - self.m.replace( - iidx, - BinOpInst::new( - Operand::Var(op_iidx), - BinOp::And, - Operand::Const(op_cidx), - ) - .into(), - ); - } + } + (Operand::Var(_), Operand::Var(_)) => (), + }, + BinOp::And => match ( + self.an.op_map(&self.m, x.lhs(&self.m)), + self.an.op_map(&self.m, x.rhs(&self.m)), + ) { + (Operand::Const(op_cidx), Operand::Var(op_iidx)) + | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { + match self.m.const_(op_cidx) { + Const::Int(_, 0) => { + // Replace `x & 0` with `0`. + self.m.replace(iidx, Inst::Const(op_cidx)); + } + _ => { + // Canonicalise to (Var, Const). + self.m.replace( + iidx, + BinOpInst::new( + Operand::Var(op_iidx), + BinOp::And, + Operand::Const(op_cidx), + ) + .into(), + ); } } - (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { - match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { - (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { - debug_assert_eq!(lhs_tyidx, rhs_tyidx); - let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { - panic!() - }; - let cidx = self.m.insert_const_int( - *lhs_tyidx, - (lhs_v & rhs_v).truncate(*bits), - )?; - self.m.replace(iidx, Inst::Const(cidx)); - } - _ => todo!(), + } + (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { + match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { + (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { + debug_assert_eq!(lhs_tyidx, rhs_tyidx); + let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { + panic!() + }; + let cidx = self.m.insert_const_int( + *lhs_tyidx, + (lhs_v & rhs_v).truncate(*bits), + )?; + self.m.replace(iidx, Inst::Const(cidx)); } + _ => todo!(), } - (Operand::Var(_), Operand::Var(_)) => (), - }, - BinOp::LShr => match ( - self.an.op_map(&self.m, x.lhs(&self.m)), - self.an.op_map(&self.m, x.rhs(&self.m)), - ) { - (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { - match self.m.const_(op_cidx) { - Const::Int(_, 0) => { - // Replace `x >> 0` with `x`. 
- self.m.replace(iidx, Inst::Copy(op_iidx)); - } - _ => { - // Canonicalise to (Var, Const). - self.m.replace( - iidx, - BinOpInst::new( - Operand::Var(op_iidx), - BinOp::LShr, - Operand::Const(op_cidx), - ) - .into(), - ); - } + } + (Operand::Var(_), Operand::Var(_)) => (), + }, + BinOp::LShr => match ( + self.an.op_map(&self.m, x.lhs(&self.m)), + self.an.op_map(&self.m, x.rhs(&self.m)), + ) { + (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { + match self.m.const_(op_cidx) { + Const::Int(_, 0) => { + // Replace `x >> 0` with `x`. + self.m.replace(iidx, Inst::Copy(op_iidx)); + } + _ => { + // Canonicalise to (Var, Const). + self.m.replace( + iidx, + BinOpInst::new( + Operand::Var(op_iidx), + BinOp::LShr, + Operand::Const(op_cidx), + ) + .into(), + ); } } - (Operand::Const(_), Operand::Var(_)) => (), - (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { - match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { - (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { - debug_assert_eq!(lhs_tyidx, rhs_tyidx); - let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { - panic!() - }; - let cidx = self.m.insert_const_int( - *lhs_tyidx, - (lhs_v >> rhs_v).truncate(*bits), - )?; - self.m.replace(iidx, Inst::Const(cidx)); - } - _ => todo!(), + } + (Operand::Const(_), Operand::Var(_)) => (), + (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { + match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { + (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { + debug_assert_eq!(lhs_tyidx, rhs_tyidx); + let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { + panic!() + }; + let cidx = self.m.insert_const_int( + *lhs_tyidx, + (lhs_v >> rhs_v).truncate(*bits), + )?; + self.m.replace(iidx, Inst::Const(cidx)); } + _ => todo!(), } - (Operand::Var(_), Operand::Var(_)) => (), - }, - BinOp::Mul => match ( - self.an.op_map(&self.m, x.lhs(&self.m)), - self.an.op_map(&self.m, x.rhs(&self.m)), - ) { - (Operand::Const(op_cidx), Operand::Var(op_iidx)) - | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { - match self.m.const_(op_cidx) { - Const::Int(_, 0) => { - // Replace `x * 0` with `0`. - self.m.replace(iidx, Inst::Const(op_cidx)); - } - Const::Int(_, 1) => { - // Replace `x * 1` with `x`. - self.m.replace(iidx, Inst::Copy(op_iidx)); - } - Const::Int(ty_idx, x) if x.is_power_of_two() => { - // Replace `x * y` with `x << ...`. - let shl = u64::from(x.ilog2()); - let shl_op = Operand::Const( - self.m.insert_const(Const::Int(*ty_idx, shl))?, - ); - let new_inst = - BinOpInst::new(Operand::Var(op_iidx), BinOp::Shl, shl_op) - .into(); - self.m.replace(iidx, new_inst); - } - _ => { - // Canonicalise to (Var, Const). - self.m.replace( - iidx, - BinOpInst::new( - Operand::Var(op_iidx), - BinOp::Mul, - Operand::Const(op_cidx), - ) - .into(), - ); - } + } + (Operand::Var(_), Operand::Var(_)) => (), + }, + BinOp::Mul => match ( + self.an.op_map(&self.m, x.lhs(&self.m)), + self.an.op_map(&self.m, x.rhs(&self.m)), + ) { + (Operand::Const(op_cidx), Operand::Var(op_iidx)) + | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { + match self.m.const_(op_cidx) { + Const::Int(_, 0) => { + // Replace `x * 0` with `0`. + self.m.replace(iidx, Inst::Const(op_cidx)); + } + Const::Int(_, 1) => { + // Replace `x * 1` with `x`. + self.m.replace(iidx, Inst::Copy(op_iidx)); + } + Const::Int(ty_idx, x) if x.is_power_of_two() => { + // Replace `x * y` with `x << ...`. 
+ let shl = u64::from(x.ilog2()); + let shl_op = + Operand::Const(self.m.insert_const(Const::Int(*ty_idx, shl))?); + let new_inst = + BinOpInst::new(Operand::Var(op_iidx), BinOp::Shl, shl_op) + .into(); + self.m.replace(iidx, new_inst); + } + _ => { + // Canonicalise to (Var, Const). + self.m.replace( + iidx, + BinOpInst::new( + Operand::Var(op_iidx), + BinOp::Mul, + Operand::Const(op_cidx), + ) + .into(), + ); } } - (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { - match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { - (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { - debug_assert_eq!(lhs_tyidx, rhs_tyidx); - let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { - panic!() - }; - let cidx = self.m.insert_const_int( - *lhs_tyidx, - (lhs_v.wrapping_mul(*rhs_v)).truncate(*bits), - )?; - self.m.replace(iidx, Inst::Const(cidx)); - } - _ => todo!(), + } + (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { + match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { + (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { + debug_assert_eq!(lhs_tyidx, rhs_tyidx); + let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { + panic!() + }; + let cidx = self.m.insert_const_int( + *lhs_tyidx, + (lhs_v.wrapping_mul(*rhs_v)).truncate(*bits), + )?; + self.m.replace(iidx, Inst::Const(cidx)); } + _ => todo!(), } - (Operand::Var(_), Operand::Var(_)) => (), - }, - BinOp::Or => match ( - self.an.op_map(&self.m, x.lhs(&self.m)), - self.an.op_map(&self.m, x.rhs(&self.m)), - ) { - (Operand::Const(op_cidx), Operand::Var(op_iidx)) - | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { - match self.m.const_(op_cidx) { - Const::Int(_, 0) => { - // Replace `x | 0` with `x`. - self.m.replace(iidx, Inst::Copy(op_iidx)); - } - _ => { - // Canonicalise to (Var, Const). - self.m.replace( - iidx, - BinOpInst::new( - Operand::Var(op_iidx), - BinOp::Or, - Operand::Const(op_cidx), - ) - .into(), - ); - } + } + (Operand::Var(_), Operand::Var(_)) => (), + }, + BinOp::Or => match ( + self.an.op_map(&self.m, x.lhs(&self.m)), + self.an.op_map(&self.m, x.rhs(&self.m)), + ) { + (Operand::Const(op_cidx), Operand::Var(op_iidx)) + | (Operand::Var(op_iidx), Operand::Const(op_cidx)) => { + match self.m.const_(op_cidx) { + Const::Int(_, 0) => { + // Replace `x | 0` with `x`. + self.m.replace(iidx, Inst::Copy(op_iidx)); + } + _ => { + // Canonicalise to (Var, Const). 
+ self.m.replace( + iidx, + BinOpInst::new( + Operand::Var(op_iidx), + BinOp::Or, + Operand::Const(op_cidx), + ) + .into(), + ); } } - (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { - match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { - (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { - debug_assert_eq!(lhs_tyidx, rhs_tyidx); - let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { - panic!() - }; - let cidx = self.m.insert_const_int( - *lhs_tyidx, - (lhs_v | rhs_v).truncate(*bits), - )?; - self.m.replace(iidx, Inst::Const(cidx)); - } - _ => todo!(), + } + (Operand::Const(lhs_cidx), Operand::Const(rhs_cidx)) => { + match (self.m.const_(lhs_cidx), self.m.const_(rhs_cidx)) { + (Const::Int(lhs_tyidx, lhs_v), Const::Int(rhs_tyidx, rhs_v)) => { + debug_assert_eq!(lhs_tyidx, rhs_tyidx); + let Ty::Integer(bits) = self.m.type_(*lhs_tyidx) else { + panic!() + }; + let cidx = self.m.insert_const_int( + *lhs_tyidx, + (lhs_v | rhs_v).truncate(*bits), + )?; + self.m.replace(iidx, Inst::Const(cidx)); } + _ => todo!(), } - (Operand::Var(_), Operand::Var(_)) => (), - }, - _ => (), - }, - Inst::DynPtrAdd(x) => { - if let Operand::Const(cidx) = self.an.op_map(&self.m, x.num_elems(&self.m)) { - let Const::Int(_, v) = self.m.const_(cidx) else { - panic!() - }; - // DynPtrAdd indices are signed, so we have to be careful to interpret the - // constant as such. - let v = *v as i64; - // LLVM IR allows `off` to be an `i64` but our IR currently allows only an - // `i32`. On that basis, we can hit our limits before the program has - // itself hit UB, at which point we can't go any further. - let off = i32::try_from(v) - .map_err(|_| ()) - .and_then(|v| v.checked_mul(i32::from(x.elem_size())).ok_or(())) - .map_err(|_| { - CompilationError::LimitExceeded( - "`DynPtrAdd` offset exceeded `i32` bounds".into(), - ) - })?; - self.m - .replace(iidx, Inst::PtrAdd(PtrAddInst::new(x.ptr(&self.m), off))); } + (Operand::Var(_), Operand::Var(_)) => (), + }, + _ => (), + }, + Inst::DynPtrAdd(x) => { + if let Operand::Const(cidx) = self.an.op_map(&self.m, x.num_elems(&self.m)) { + let Const::Int(_, v) = self.m.const_(cidx) else { + panic!() + }; + // DynPtrAdd indices are signed, so we have to be careful to interpret the + // constant as such. + let v = *v as i64; + // LLVM IR allows `off` to be an `i64` but our IR currently allows only an + // `i32`. On that basis, we can hit our limits before the program has + // itself hit UB, at which point we can't go any further. + let off = i32::try_from(v) + .map_err(|_| ()) + .and_then(|v| v.checked_mul(i32::from(x.elem_size())).ok_or(())) + .map_err(|_| { + CompilationError::LimitExceeded( + "`DynPtrAdd` offset exceeded `i32` bounds".into(), + ) + })?; + self.m + .replace(iidx, Inst::PtrAdd(PtrAddInst::new(x.ptr(&self.m), off))); } - Inst::ICmp(x) => { - self.icmp(iidx, x); + } + Inst::ICmp(x) => { + self.icmp(iidx, x); + } + Inst::Guard(x) => { + if let Operand::Const(_) = self.an.op_map(&self.m, x.cond(&self.m)) { + // A guard that references a constant is, by definition, not needed and + // doesn't affect future analyses. + self.m.replace(iidx, Inst::Tombstone); + } else { + self.an.guard(&self.m, x); } - Inst::Guard(x) => { - if let Operand::Const(_) = self.an.op_map(&self.m, x.cond(&self.m)) { - // A guard that references a constant is, by definition, not needed and - // doesn't affect future analyses. 
- self.m.replace(iidx, Inst::Tombstone); - } else { - self.an.guard(&self.m, x); + } + Inst::PtrAdd(x) => match self.an.op_map(&self.m, x.ptr(&self.m)) { + Operand::Const(_) => todo!(), + Operand::Var(op_iidx) => { + if x.off() == 0 { + self.m.replace(iidx, Inst::Copy(op_iidx)); } } - Inst::PtrAdd(x) => match self.an.op_map(&self.m, x.ptr(&self.m)) { - Operand::Const(_) => todo!(), - Operand::Var(op_iidx) => { - if x.off() == 0 { - self.m.replace(iidx, Inst::Copy(op_iidx)); - } - } - }, - Inst::SExt(x) => { - if let Operand::Const(cidx) = self.an.op_map(&self.m, x.val(&self.m)) { - let Const::Int(src_ty, src_val) = self.m.const_(cidx) else { - unreachable!() - }; - let src_ty = self.m.type_(*src_ty); + }, + Inst::SExt(x) => { + if let Operand::Const(cidx) = self.an.op_map(&self.m, x.val(&self.m)) { + let Const::Int(src_ty, src_val) = self.m.const_(cidx) else { + unreachable!() + }; + let src_ty = self.m.type_(*src_ty); + let dst_ty = self.m.type_(x.dest_tyidx()); + let (Ty::Integer(src_bits), Ty::Integer(dst_bits)) = (src_ty, dst_ty) else { + unreachable!() + }; + let dst_val = match (src_bits, dst_bits) { + (32, 64) => Const::Int(x.dest_tyidx(), src_val.sign_extend(32, 64)), + _ => todo!("{src_bits} {dst_bits}"), + }; + let dst_cidx = self.m.insert_const(dst_val)?; + self.m.replace(iidx, Inst::Const(dst_cidx)); + } + } + Inst::Param(x) => { + // FIXME: This feels like it should be handled by trace_builder, but we can't + // do so yet because of https://github.com/ykjit/yk/issues/1435. + if let yksmp::Location::Constant(v) = self.m.param(x.paramidx()) { + let cidx = self.m.insert_const(Const::Int(x.tyidx(), u64::from(*v)))?; + self.an.set_value(iidx, Value::Const(cidx)); + } + } + Inst::ZExt(x) => { + if let Operand::Const(cidx) = self.an.op_map(&self.m, x.val(&self.m)) { + let Const::Int(_src_ty, src_val) = self.m.const_(cidx) else { + unreachable!() + }; + #[cfg(debug_assertions)] + { + let src_ty = self.m.type_(*_src_ty); let dst_ty = self.m.type_(x.dest_tyidx()); let (Ty::Integer(src_bits), Ty::Integer(dst_bits)) = (src_ty, dst_ty) else { unreachable!() }; - let dst_val = match (src_bits, dst_bits) { - (32, 64) => Const::Int(x.dest_tyidx(), src_val.sign_extend(32, 64)), - _ => todo!("{src_bits} {dst_bits}"), - }; - let dst_cidx = self.m.insert_const(dst_val)?; - self.m.replace(iidx, Inst::Const(dst_cidx)); - } - } - Inst::Param(x) => { - // FIXME: This feels like it should be handled by trace_builder, but we can't - // do so yet because of https://github.com/ykjit/yk/issues/1435. 
- if let yksmp::Location::Constant(v) = self.m.param(x.paramidx()) {
- let cidx = self.m.insert_const(Const::Int(x.tyidx(), u64::from(*v)))?;
- self.an.set_value(iidx, Value::Const(cidx));
+ debug_assert!(src_bits <= dst_bits);
+ debug_assert!(*dst_bits <= 64);
 }
+ let dst_cidx = self.m.insert_const(Const::Int(x.dest_tyidx(), *src_val))?;
+ self.m.replace(iidx, Inst::Const(dst_cidx));
 }
- Inst::ZExt(x) => {
- if let Operand::Const(cidx) = self.an.op_map(&self.m, x.val(&self.m)) {
- let Const::Int(_src_ty, src_val) = self.m.const_(cidx) else {
- unreachable!()
- };
- #[cfg(debug_assertions)]
- {
- let src_ty = self.m.type_(*_src_ty);
- let dst_ty = self.m.type_(x.dest_tyidx());
- let (Ty::Integer(src_bits), Ty::Integer(dst_bits)) = (src_ty, dst_ty)
- else {
- unreachable!()
- };
- debug_assert!(src_bits <= dst_bits);
- debug_assert!(*dst_bits <= 64);
- }
- let dst_cidx = self.m.insert_const(Const::Int(x.dest_tyidx(), *src_val))?;
- self.m.replace(iidx, Inst::Const(dst_cidx));
- }
- }
- _ => (),
 }
+ _ => (),
+ };
+ Ok(())
+ }
+
+ fn opt(mut self) -> Result<Module, CompilationError> {
+ let base = self.m.insts_len();
+ // The instruction offset after all `loadti` instructions.
+ let is_sidetrace = matches!(self.m.inst(self.m.last_inst_idx()), Inst::SidetraceEnd);
+
+ // Disable loop peeling if there is no `header_end` and we are running tests.
+ #[cfg(test)]
+ let disable_peel = !matches!(self.m.inst(self.m.last_inst_idx()), Inst::TraceHeaderEnd);
+
+ // Note that since we will apply loop peeling here, the list of instructions grows as this
+ // loop runs. Each instruction we process is (after optimisations have been applied)
+ // duplicated and copied to the end of the module.
+ let skipping = self.m.iter_skipping_insts().collect::<Vec<_>>();
+ for (iidx, _inst) in skipping.into_iter() {
+ self.opt_inst(iidx)?;
 self.cse(iidx);
 }
 // FIXME: When code generation supports backwards register allocation, we won't need to
 // explicitly perform dead code elimination and this function can be made `#[cfg(test)]` only.
 self.m.dead_code_elimination();
+
+ #[cfg(test)]
+ if disable_peel {
+ return Ok(self.m);
+ }
+
+ if is_sidetrace {
+ // Side-traces don't loop and thus cannot be peeled.
+ return Ok(self.m);
+ }
+
+ // Now that we've processed the trace header, duplicate it to create the loop body.
+ // FIXME: Do we need to call `iter_skipping_inst_idxs` again?
+ // Maps header instructions to their position in the body.
+ let mut iidx_map = vec![0; base];
+ let skipping = self.m.iter_skipping_insts().collect::<Vec<_>>();
+ for (iidx, inst) in skipping.into_iter() {
+ let c = inst.dup_and_remap_locals(&mut self.m, &|i: InstIdx| {
+ let newiidx = iidx_map[usize::from(i)];
+ Operand::Var(InstIdx::try_from(newiidx).unwrap())
+ })?;
+ let copyiidx = self.m.push(c)?;
+ iidx_map[usize::from(iidx)] = usize::from(copyiidx);
+ if let Inst::TraceHeaderStart = inst {
+ for (headop, bodyop) in self
+ .m
+ .trace_header_end()
+ .iter()
+ .zip(self.m.trace_body_start())
+ {
+ // Inform the analyser about any constants being passed from the header into
+ // the body.
+ if let Operand::Const(cidx) = headop.unpack(&self.m) {
+ let Operand::Var(op_iidx) = bodyop.unpack(&self.m) else {
+ panic!()
+ };
+ self.an.set_value(op_iidx, Value::Const(cidx));
+ }
+ }
+ }
+ self.opt_inst(copyiidx)?;
+ }
+
+ // FIXME: Apply CSE and run another pass of optimisations on the peeled loop.
+ self.m.dead_code_elimination(); Ok(self.m) } @@ -1054,4 +1114,60 @@ mod test { ", ); } + + #[test] + fn opt_peeling_simple() { + Module::assert_ir_transform_eq( + " + entry: + %0: i8 = param 0 + header_start [%0] + %2: i8 = add %0, 1i8 + header_end [%2] + ", + |m| opt(m).unwrap(), + " + ... + entry: + %0: i8 = param ... + header_start [%0] + %2: i8 = add %0, 1i8 + header_end [%2] + %4: i8 = param ... + body_start [%4] + %6: i8 = add %4, 1i8 + body_end [%6] + ", + ); + } + + #[ignore] + #[test] + fn opt_peeling_hoist() { + Module::assert_ir_transform_eq( + " + entry: + %0: i8 = param 0 + %1: i8 = param 1 + body_start [%0, %1] + %2: i8 = add %0, 1i8 + %3: i8 = add %1, %2 + body_end [%0, %3] + ", + |m| opt(m).unwrap(), + " + ... + entry: + %0: i8 = param ... + %1: i8 = param ... + head_start [%0, %1] + %3: i8 = add %0, 1i8 + %4: i8 = add %1, %3 + head_end [%0, %4, %3] + body_start [%0, %4, %3] + %10: i8 = add %4, %3 + body_end [%0, %10, %3] + ", + ); + } } diff --git a/ykrt/src/compile/jitc_yk/trace_builder.rs b/ykrt/src/compile/jitc_yk/trace_builder.rs index c47c2cba1..36431fb17 100644 --- a/ykrt/src/compile/jitc_yk/trace_builder.rs +++ b/ykrt/src/compile/jitc_yk/trace_builder.rs @@ -131,9 +131,9 @@ impl TraceBuilder { jit_ir::Operand::Var(self.jit_mod.last_inst_idx()), ); self.jit_mod - .push_loop_start_var(jit_ir::Operand::Var(self.jit_mod.last_inst_idx())); + .push_body_start_var(jit_ir::Operand::Var(self.jit_mod.last_inst_idx())); } - self.jit_mod.push(jit_ir::Inst::TraceLoopStart)?; + self.jit_mod.push(jit_ir::Inst::TraceHeaderStart)?; Ok(()) } @@ -1389,19 +1389,19 @@ impl TraceBuilder { for idx in 0..safepoint.lives.len() { let aot_op = &safepoint.lives[idx]; let jit_op = &self.local_map[&aot_op.to_inst_id()]; - self.jit_mod.push_loop_jump_var(jit_op.clone()); + self.jit_mod.push_header_end_var(jit_op.clone()); } self.jit_mod.set_root_jump_addr(sti.unwrap().root_addr.0); - self.jit_mod.push(jit_ir::Inst::RootJump)?; + self.jit_mod.push(jit_ir::Inst::SidetraceEnd)?; } else { // For normal traces insert a jump back to the loop start. let safepoint = cpcall.safepoint().unwrap(); for idx in 0..safepoint.lives.len() { let aot_op = &safepoint.lives[idx]; let jit_op = &self.local_map[&aot_op.to_inst_id()]; - self.jit_mod.push_loop_jump_var(jit_op.clone()); + self.jit_mod.push_header_end_var(jit_op.clone()); } - self.jit_mod.push(jit_ir::Inst::TraceLoopJump)?; + self.jit_mod.push(jit_ir::Inst::TraceHeaderEnd)?; } Ok(self.jit_mod)
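
For readers skimming the `BinOp::Mul` arm earlier in this patch: a multiply by a power-of-two constant is strength-reduced to a left shift, and constant folding wraps to the operand's bit width. The stand-alone Rust sketch below is illustrative only; it uses plain integers and a hypothetical `truncate` helper rather than the patch's `jit_ir` types, but shows the intended wrapping semantics.

/// Truncate `v` to its low `bits` bits (`bits` <= 64), i.e. wrap the value the
/// way a `bits`-wide JIT IR integer would.
fn truncate(v: u64, bits: u32) -> u64 {
    if bits == 64 { v } else { v & ((1u64 << bits) - 1) }
}

/// Multiply `x` by the constant `c` at width `bits`, using a shift when `c` is
/// a power of two.
fn mul_const(x: u64, c: u64, bits: u32) -> u64 {
    if c.is_power_of_two() {
        // x * 2^k equals x << k (modulo 2^bits), which is what the Mul arm
        // rewrites into a Shl instruction.
        truncate(x.wrapping_shl(c.ilog2()), bits)
    } else {
        // Otherwise fold with a plain wrapping multiply, as the
        // (Const, Const) arm does.
        truncate(x.wrapping_mul(c), bits)
    }
}

fn main() {
    // 8-bit example: 200 * 4 = 800, which wraps to 32 modulo 256 on either path.
    assert_eq!(mul_const(200, 4, 8), 32);
    // Non-power-of-two constants take the plain wrapping-multiply path: 600 mod 256 = 88.
    assert_eq!(mul_const(200, 3, 8), 88);
    println!("ok");
}

Either path yields the same value modulo 2^bits, which is why the rewrite is sound regardless of overflow.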
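
The core of the new peeling pass in `opt()` is the `iidx_map` remap: every header instruction is duplicated onto the end of the module, and `Var` operands in the copies are rewritten to point at other copies rather than at the original header instructions. The sketch below is a minimal illustration of just that remapping step; `Operand`, `Inst`, `remap` and `peel` are simplified stand-ins, not the real `jit_ir` `Module`/`Inst` API.

#[derive(Clone, Copy, Debug, PartialEq)]
enum Operand {
    Var(usize), // index of another instruction
    Const(i64), // an inline constant
}

#[derive(Clone, Debug, PartialEq)]
enum Inst {
    Param,                 // a trace input
    Add(Operand, Operand), // a binary operation over two operands
}

/// Rewrite a header-local `Var` operand so it refers to the body copy of the
/// instruction it named; constants pass through unchanged.
fn remap(iidx_map: &[usize], op: Operand) -> Operand {
    match op {
        Operand::Var(i) => Operand::Var(iidx_map[i]),
        c @ Operand::Const(_) => c,
    }
}

/// Duplicate `header` to form a peeled loop body: each copy is appended to the
/// module and `iidx_map` records where the copy of each header instruction
/// ended up, so later copies can reference earlier ones.
fn peel(header: &[Inst]) -> Vec<Inst> {
    let mut module = header.to_vec();
    let mut iidx_map = vec![0usize; header.len()];
    for (iidx, inst) in header.iter().enumerate() {
        let copy = match inst {
            Inst::Param => Inst::Param,
            Inst::Add(l, r) => Inst::Add(remap(&iidx_map, *l), remap(&iidx_map, *r)),
        };
        module.push(copy);
        iidx_map[iidx] = module.len() - 1;
    }
    module
}

fn main() {
    // A header equivalent to: %0 = param; %1 = add %0, 1
    let header = vec![Inst::Param, Inst::Add(Operand::Var(0), Operand::Const(1))];
    let module = peel(&header);
    // The body copy of %1 (now %3) must reference the body copy of %0 (%2), not %0 itself.
    assert_eq!(module[3], Inst::Add(Operand::Var(2), Operand::Const(1)));
    println!("{module:?}");
}

This is the same shape the `opt_peeling_simple` test above checks on real IR: the body's `%4`/`%6` mirror the header's `%0`/`%2` and reference each other, not the header.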