diff --git a/Cargo.lock b/Cargo.lock index cf2c4bf79bbf..804617b3f91a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -839,6 +839,7 @@ dependencies = [ "log", "pretty_env_logger", "rayon", + "regalloc2", "serde", "similar", "target-lexicon", diff --git a/cranelift/Cargo.toml b/cranelift/Cargo.toml index fdbd891797b1..330ed71d4a11 100644 --- a/cranelift/Cargo.toml +++ b/cranelift/Cargo.toml @@ -49,6 +49,9 @@ similar = { workspace = true } toml = { workspace = true } serde = { workspace = true } fxhash = "0.2.1" +# Note that this just enables `trace-log` for `clif-util` and doesn't turn it on +# for all of Cranelift, which would be bad. +regalloc2 = { workspace = true, features = ["trace-log"] } [features] default = ["disas", "wasm", "cranelift-codegen/all-arch", "cranelift-codegen/trace-log", "souper-harvest"] diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index ccdcd3061dbd..4a6dc9efd71d 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -98,6 +98,10 @@ impl ABIMachineSpec for AArch64MachineDeps { where I: IntoIterator, { + if call_conv == isa::CallConv::Tail { + return compute_arg_locs_tail(params, add_ret_area_ptr, args); + } + let is_apple_cc = call_conv.extends_apple_aarch64(); // See AArch64 ABI (https://github.com/ARM-software/abi-aa/blob/2021Q1/aapcs64/aapcs64.rst#64parameter-passing), sections 6.4. 
@@ -896,7 +900,7 @@ impl ABIMachineSpec for AArch64MachineDeps { } fn gen_clobber_restore( - _call_conv: isa::CallConv, + call_conv: isa::CallConv, sig: &Signature, flags: &settings::Flags, clobbers: &[Writable], @@ -904,7 +908,8 @@ impl ABIMachineSpec for AArch64MachineDeps { _outgoing_args_size: u32, ) -> SmallVec<[Inst; 16]> { let mut insts = SmallVec::new(); - let (clobbered_int, clobbered_vec) = get_regs_restored_in_epilogue(flags, sig, clobbers); + let (clobbered_int, clobbered_vec) = + get_regs_restored_in_epilogue(call_conv, flags, sig, clobbers); // Free the fixed frame if necessary. if fixed_frame_storage_size > 0 { @@ -1107,8 +1112,12 @@ impl ABIMachineSpec for AArch64MachineDeps { s.nominal_sp_to_fp } - fn get_regs_clobbered_by_call(_call_conv_of_callee: isa::CallConv) -> PRegSet { - DEFAULT_AAPCS_CLOBBERS + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> PRegSet { + if call_conv_of_callee == isa::CallConv::Tail { + TAIL_CLOBBERS + } else { + DEFAULT_AAPCS_CLOBBERS + } } fn get_ext_mode( @@ -1119,7 +1128,7 @@ impl ABIMachineSpec for AArch64MachineDeps { } fn get_clobbered_callee_saves( - _call_conv: isa::CallConv, + call_conv: isa::CallConv, flags: &settings::Flags, sig: &Signature, regs: &[Writable], @@ -1127,7 +1136,9 @@ impl ABIMachineSpec for AArch64MachineDeps { let mut regs: Vec> = regs .iter() .cloned() - .filter(|r| is_reg_saved_in_prologue(flags.enable_pinned_reg(), sig, r.to_reg())) + .filter(|r| { + is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), sig, r.to_reg()) + }) .collect(); // Sort registers for deterministic code output. 
We can do an unstable @@ -1151,6 +1162,127 @@ impl ABIMachineSpec for AArch64MachineDeps { } } +fn compute_arg_locs_tail<'a, I>( + params: I, + add_ret_area_ptr: bool, + mut args: ArgsAccumulator<'_>, +) -> CodegenResult<(u32, Option)> +where + I: IntoIterator, +{ + let mut xregs = TAIL_CLOBBERS + .into_iter() + .filter(|r| r.class() == RegClass::Int) + // We reserve `x0` for the return area pointer. For simplicity, we + // reserve it even when there is no return area pointer needed. This + // also means that identity functions don't have to shuffle arguments to + // different return registers because we shifted all argument register + // numbers down by one to make space for the return area pointer. + // + // Also, we cannot use all allocatable GPRs as arguments because we need + // at least one allocatable register for holding the callee address in + // indirect calls. So skip `x1` also, reserving it for that role. + .skip(2); + + let mut vregs = TAIL_CLOBBERS + .into_iter() + .filter(|r| r.class() == RegClass::Float); + + let mut next_stack: u32 = 0; + + // Get the next stack slot for the given type. + let stack = |next_stack: &mut u32, ty: ir::Type| { + *next_stack = align_to(*next_stack, ty.bytes()); + let offset = i64::from(*next_stack); + *next_stack += ty.bytes(); + ABIArgSlot::Stack { + offset, + ty, + extension: ir::ArgumentExtension::None, + } + }; + + // Get the next `x` register available, or a stack slot if all are in use. + let mut xreg = |next_stack: &mut u32, ty| { + xregs + .next() + .map(|reg| ABIArgSlot::Reg { + reg: reg.into(), + ty, + extension: ir::ArgumentExtension::None, + }) + .unwrap_or_else(|| stack(next_stack, ty)) + }; + + // Get the next `v` register available, or a stack slot if all are in use. 
+ let mut vreg = |next_stack: &mut u32, ty| { + vregs + .next() + .map(|reg| ABIArgSlot::Reg { + reg: reg.into(), + ty, + extension: ir::ArgumentExtension::None, + }) + .unwrap_or_else(|| stack(next_stack, ty)) + }; + + for param in params { + assert!( + legal_type_for_machine(param.value_type), + "Invalid type for AArch64: {:?}", + param.value_type + ); + + match param.purpose { + ir::ArgumentPurpose::Normal | ir::ArgumentPurpose::VMContext => {} + ir::ArgumentPurpose::StructArgument(_) + | ir::ArgumentPurpose::StructReturn + | ir::ArgumentPurpose::StackLimit => unimplemented!( + "support for {:?} parameters is not implemented for the `tail` \ + calling convention yet", + param.purpose, + ), + } + + let (reg_classes, reg_types) = Inst::rc_for_type(param.value_type)?; + args.push(ABIArg::Slots { + slots: reg_classes + .iter() + .zip(reg_types) + .map(|(cls, ty)| match cls { + RegClass::Int => xreg(&mut next_stack, *ty), + RegClass::Float => vreg(&mut next_stack, *ty), + RegClass::Vector => unreachable!(), + }) + .collect(), + purpose: param.purpose, + }); + } + + let ret_ptr = if add_ret_area_ptr { + let idx = args.args().len(); + args.push(ABIArg::reg( + xreg_preg(0).into(), + types::I64, + ir::ArgumentExtension::None, + ir::ArgumentPurpose::Normal, + )); + Some(idx) + } else { + None + }; + + next_stack = align_to(next_stack, 16); + + // To avoid overflow issues, limit the arg/return size to something + // reasonable -- here, 128 MB. + if next_stack > STACK_ARG_RET_SIZE_LIMIT { + return Err(CodegenError::ImplLimitExceeded); + } + + Ok((next_stack, ret_ptr)) +} + /// Is this type supposed to be seen on this machine? E.g. references of the /// wrong width are invalid. fn legal_type_for_machine(ty: Type) -> bool { @@ -1162,7 +1294,16 @@ fn legal_type_for_machine(ty: Type) -> bool { /// Is the given register saved in the prologue if clobbered, i.e., is it a /// callee-save? 
-fn is_reg_saved_in_prologue(enable_pinned_reg: bool, sig: &Signature, r: RealReg) -> bool { +fn is_reg_saved_in_prologue( + call_conv: isa::CallConv, + enable_pinned_reg: bool, + sig: &Signature, + r: RealReg, +) -> bool { + if call_conv == isa::CallConv::Tail { + return false; + } + // FIXME: We need to inspect whether a function is returning Z or P regs too. let save_z_regs = sig .params @@ -1204,6 +1345,7 @@ fn is_reg_saved_in_prologue(enable_pinned_reg: bool, sig: &Signature, r: RealReg /// prologue and restored in the epilogue, given the set of all registers /// written by the function's body. fn get_regs_restored_in_epilogue( + call_conv: isa::CallConv, flags: &settings::Flags, sig: &Signature, regs: &[Writable], @@ -1211,7 +1353,7 @@ fn get_regs_restored_in_epilogue( let mut int_saves = vec![]; let mut vec_saves = vec![]; for ® in regs { - if is_reg_saved_in_prologue(flags.enable_pinned_reg(), sig, reg.to_reg()) { + if is_reg_saved_in_prologue(call_conv, flags.enable_pinned_reg(), sig, reg.to_reg()) { match reg.to_reg().class() { RegClass::Int => int_saves.push(reg), RegClass::Float => vec_saves.push(reg), @@ -1297,3 +1439,69 @@ const fn default_aapcs_clobbers() -> PRegSet { } const DEFAULT_AAPCS_CLOBBERS: PRegSet = default_aapcs_clobbers(); + +// NB: The `tail` calling convention clobbers all allocatable registers. +const TAIL_CLOBBERS: PRegSet = PRegSet::empty() + .with(xreg_preg(0)) + .with(xreg_preg(1)) + .with(xreg_preg(2)) + .with(xreg_preg(3)) + .with(xreg_preg(4)) + .with(xreg_preg(5)) + .with(xreg_preg(6)) + .with(xreg_preg(7)) + .with(xreg_preg(8)) + .with(xreg_preg(9)) + .with(xreg_preg(10)) + .with(xreg_preg(11)) + .with(xreg_preg(12)) + .with(xreg_preg(13)) + .with(xreg_preg(14)) + .with(xreg_preg(15)) + // Cranelift reserves x16 and x17 as unallocatable scratch registers. + // + // x18 can be used by the platform and therefore is not allocatable. 
+ .with(xreg_preg(19)) + .with(xreg_preg(20)) + .with(xreg_preg(21)) + .with(xreg_preg(22)) + .with(xreg_preg(23)) + .with(xreg_preg(24)) + .with(xreg_preg(25)) + .with(xreg_preg(26)) + .with(xreg_preg(27)) + .with(xreg_preg(28)) + // NB: x29 is the FP, x30 is the link register, and x31 is the SP. None of + // these are allocatable. + .with(vreg_preg(0)) + .with(vreg_preg(1)) + .with(vreg_preg(2)) + .with(vreg_preg(3)) + .with(vreg_preg(4)) + .with(vreg_preg(5)) + .with(vreg_preg(6)) + .with(vreg_preg(7)) + .with(vreg_preg(8)) + .with(vreg_preg(9)) + .with(vreg_preg(10)) + .with(vreg_preg(11)) + .with(vreg_preg(12)) + .with(vreg_preg(13)) + .with(vreg_preg(14)) + .with(vreg_preg(15)) + .with(vreg_preg(16)) + .with(vreg_preg(17)) + .with(vreg_preg(18)) + .with(vreg_preg(19)) + .with(vreg_preg(20)) + .with(vreg_preg(21)) + .with(vreg_preg(22)) + .with(vreg_preg(23)) + .with(vreg_preg(24)) + .with(vreg_preg(25)) + .with(vreg_preg(26)) + .with(vreg_preg(27)) + .with(vreg_preg(28)) + .with(vreg_preg(29)) + .with(vreg_preg(30)) + .with(vreg_preg(31)); diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs index fa538a5da84c..5eb0145cda87 100644 --- a/cranelift/codegen/src/isa/riscv64/abi.rs +++ b/cranelift/codegen/src/isa/riscv64/abi.rs @@ -103,11 +103,13 @@ impl ABIMachineSpec for Riscv64MachineDeps { { // All registers that can be used as parameters or rets. // both start and end are included. 
- let (x_start, x_end, f_start, f_end) = if args_or_rets == ArgsOrRets::Args { - (10, 17, 10, 17) - } else { - let end = if call_conv.extends_wasmtime() { 10 } else { 11 }; - (10, end, 10, end) + let (x_start, x_end, f_start, f_end) = match (call_conv, args_or_rets) { + (isa::CallConv::Tail, _) => (9, 29, 0, 31), + (_, ArgsOrRets::Args) => (10, 17, 10, 17), + (_, ArgsOrRets::Rets) => { + let end = if call_conv.extends_wasmtime() { 10 } else { 11 }; + (10, end, 10, end) + } }; let mut next_x_reg = x_start; let mut next_f_reg = f_start; @@ -215,13 +217,16 @@ impl ABIMachineSpec for Riscv64MachineDeps { } else { None }; + next_stack = align_to(next_stack, Self::stack_align(call_conv)); + // To avoid overflow issues, limit the arg/return size to something // reasonable -- here, 128 MB. if next_stack > STACK_ARG_RET_SIZE_LIMIT { return Err(CodegenError::ImplLimitExceeded); } - CodegenResult::Ok((next_stack, pos)) + + Ok((next_stack, pos)) } fn fp_to_arg_offset(_call_conv: isa::CallConv, _flags: &settings::Flags) -> i64 { @@ -643,8 +648,12 @@ impl ABIMachineSpec for Riscv64MachineDeps { s.nominal_sp_to_fp } - fn get_regs_clobbered_by_call(_call_conv_of_callee: isa::CallConv) -> PRegSet { - CLOBBERS + fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> PRegSet { + if call_conv_of_callee == isa::CallConv::Tail { + TAIL_CLOBBERS + } else { + DEFAULT_CLOBBERS + } } fn get_clobbered_callee_saves( @@ -706,7 +715,11 @@ const CALLEE_SAVE_F_REG: [bool; 32] = [ /// This should be the registers that must be saved by callee. 
#[inline]
-fn is_reg_saved_in_prologue(_conv: CallConv, reg: RealReg) -> bool {
+fn is_reg_saved_in_prologue(conv: CallConv, reg: RealReg) -> bool {
+    if conv == CallConv::Tail {
+        return false;
+    }
+
     match reg.class() {
         RegClass::Int => CALLEE_SAVE_X_REG[reg.hw_enc() as usize],
         RegClass::Float => CALLEE_SAVE_F_REG[reg.hw_enc() as usize],
@@ -731,7 +744,7 @@ fn compute_clobber_size(clobbers: &[Writable]) -> u32 {
     align_to(clobbered_size, 16)
 }
 
-const fn clobbers() -> PRegSet {
+const fn default_clobbers() -> PRegSet {
     PRegSet::empty()
         .with(px_reg(1))
         .with(px_reg(5))
@@ -806,7 +819,113 @@
         .with(pv_reg(31))
 }
 
-const CLOBBERS: PRegSet = clobbers();
+const DEFAULT_CLOBBERS: PRegSet = default_clobbers();
+
+// All allocatable registers are clobbered by calls using the `tail` calling
+// convention.
+const fn tail_clobbers() -> PRegSet {
+    PRegSet::empty()
+        // `x0` is the zero register, and not allocatable.
+        .with(px_reg(1))
+        // `x2` is the stack pointer, `x3` is the global pointer, and `x4` is
+        // the thread pointer. None are allocatable.
+        .with(px_reg(5))
+        .with(px_reg(6))
+        .with(px_reg(7))
+        // `x8` is the frame pointer, and not allocatable.
+        .with(px_reg(9))
+        .with(px_reg(10))
+        .with(px_reg(11))
+        .with(px_reg(12))
+        .with(px_reg(13))
+        .with(px_reg(14))
+        .with(px_reg(15))
+        .with(px_reg(16))
+        .with(px_reg(17))
+        .with(px_reg(18))
+        .with(px_reg(19))
+        .with(px_reg(20))
+        .with(px_reg(21))
+        .with(px_reg(22))
+        .with(px_reg(23))
+        .with(px_reg(24))
+        .with(px_reg(25))
+        .with(px_reg(26))
+        .with(px_reg(27))
+        .with(px_reg(28))
+        .with(px_reg(29))
+        // `x30` and `x31` are reserved as scratch registers, and are not
+        // allocatable.
+        //
+        // F Regs
+        .with(pf_reg(0))
+        .with(pf_reg(1))
+        .with(pf_reg(2))
+        .with(pf_reg(3))
+        .with(pf_reg(4))
+        .with(pf_reg(5))
+        .with(pf_reg(6))
+        .with(pf_reg(7))
+        .with(pf_reg(8))
+        .with(pf_reg(9))
+        .with(pf_reg(10))
+        .with(pf_reg(11))
+        .with(pf_reg(12))
+        .with(pf_reg(13))
+        .with(pf_reg(14))
+        .with(pf_reg(15))
+        .with(pf_reg(16))
+        .with(pf_reg(17))
+        .with(pf_reg(18))
+        .with(pf_reg(19))
+        .with(pf_reg(20))
+        .with(pf_reg(21))
+        .with(pf_reg(22))
+        .with(pf_reg(23))
+        .with(pf_reg(24))
+        .with(pf_reg(25))
+        .with(pf_reg(26))
+        .with(pf_reg(27))
+        .with(pf_reg(28))
+        .with(pf_reg(29))
+        .with(pf_reg(30))
+        .with(pf_reg(31))
+        // V Regs
+        .with(pv_reg(0))
+        .with(pv_reg(1))
+        .with(pv_reg(2))
+        .with(pv_reg(3))
+        .with(pv_reg(4))
+        .with(pv_reg(5))
+        .with(pv_reg(6))
+        .with(pv_reg(7))
+        .with(pv_reg(8))
+        .with(pv_reg(9))
+        .with(pv_reg(10))
+        .with(pv_reg(11))
+        .with(pv_reg(12))
+        .with(pv_reg(13))
+        .with(pv_reg(14))
+        .with(pv_reg(15))
+        .with(pv_reg(16))
+        .with(pv_reg(17))
+        .with(pv_reg(18))
+        .with(pv_reg(19))
+        .with(pv_reg(20))
+        .with(pv_reg(21))
+        .with(pv_reg(22))
+        .with(pv_reg(23))
+        .with(pv_reg(24))
+        .with(pv_reg(25))
+        .with(pv_reg(26))
+        .with(pv_reg(27))
+        .with(pv_reg(28))
+        .with(pv_reg(29))
+        .with(pv_reg(30))
+        .with(pv_reg(31))
+}
+
+const TAIL_CLOBBERS: PRegSet = tail_clobbers();
 
 impl Riscv64MachineDeps {
     fn gen_probestack_unroll(insts: &mut SmallInstVec, guard_size: u32, probe_count: u32) {
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index 28b2f7c4dfaf..96f0e2eb7992 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -458,13 +458,17 @@ impl ABIMachineSpec for X64ABIMachineSpec {
     }
 
     fn gen_clobber_save(
-        _call_conv: isa::CallConv,
+        call_conv: isa::CallConv,
         setup_frame: bool,
         flags: &settings::Flags,
         clobbered_callee_saves: &[Writable],
         fixed_frame_storage_size: u32,
         _outgoing_args_size: u32,
     ) -> (u64, SmallVec<[Self::I; 16]>) {
+        if call_conv == 
isa::CallConv::Tail { + assert!(clobbered_callee_saves.is_empty()); + } + let mut insts = SmallVec::new(); let clobbered_size = compute_clobber_size(&clobbered_callee_saves); @@ -710,7 +714,9 @@ impl ABIMachineSpec for X64ABIMachineSpec { } fn get_regs_clobbered_by_call(call_conv_of_callee: isa::CallConv) -> PRegSet { - if call_conv_of_callee.extends_windows_fastcall() { + if call_conv_of_callee == isa::CallConv::Tail { + TAIL_CLOBBERS + } else if call_conv_of_callee.extends_windows_fastcall() { WINDOWS_CLOBBERS } else { SYSV_CLOBBERS @@ -731,11 +737,10 @@ impl ABIMachineSpec for X64ABIMachineSpec { regs: &[Writable], ) -> Vec> { let mut regs: Vec> = match call_conv { - CallConv::Tail - | CallConv::Fast - | CallConv::Cold - | CallConv::SystemV - | CallConv::WasmtimeSystemV => regs + // The `tail` calling convention doesn't have any callee-save + // registers. + CallConv::Tail => vec![], + CallConv::Fast | CallConv::Cold | CallConv::SystemV | CallConv::WasmtimeSystemV => regs .iter() .cloned() .filter(|r| is_callee_save_systemv(r.to_reg(), flags.enable_pinned_reg())) @@ -803,6 +808,26 @@ impl From for SyntheticAmode { fn get_intreg_for_arg(call_conv: &CallConv, idx: usize, arg_idx: usize) -> Option { let is_fastcall = call_conv.extends_windows_fastcall(); + if *call_conv == isa::CallConv::Tail { + return match idx { + 0 => Some(regs::rax()), + 1 => Some(regs::rcx()), + 2 => Some(regs::rdx()), + 3 => Some(regs::rbx()), + 4 => Some(regs::rsi()), + 5 => Some(regs::rdi()), + 6 => Some(regs::r8()), + 7 => Some(regs::r9()), + 8 => Some(regs::r10()), + 9 => Some(regs::r11()), + 10 => Some(regs::r12()), + 11 => Some(regs::r13()), + 12 => Some(regs::r14()), + // NB: `r15` is reserved as a scratch register. + _ => None, + }; + } + // Fastcall counts by absolute argument number; SysV counts by argument of // this (integer) class. 
let i = if is_fastcall { arg_idx } else { idx }; @@ -850,7 +875,24 @@ fn get_intreg_for_retval( retval_idx: usize, ) -> Option { match call_conv { - CallConv::Tail | CallConv::Fast | CallConv::Cold | CallConv::SystemV => match intreg_idx { + CallConv::Tail => match intreg_idx { + 0 => Some(regs::rax()), + 1 => Some(regs::rcx()), + 2 => Some(regs::rdx()), + 3 => Some(regs::rbx()), + 4 => Some(regs::rsi()), + 5 => Some(regs::rdi()), + 6 => Some(regs::r8()), + 7 => Some(regs::r9()), + 8 => Some(regs::r10()), + 9 => Some(regs::r11()), + 10 => Some(regs::r12()), + 11 => Some(regs::r13()), + 12 => Some(regs::r14()), + // NB: `r15` is reserved as a scratch register. + _ => None, + }, + CallConv::Fast | CallConv::Cold | CallConv::SystemV => match intreg_idx { 0 => Some(regs::rax()), 1 => Some(regs::rdx()), _ => None, @@ -878,7 +920,18 @@ fn get_fltreg_for_retval( retval_idx: usize, ) -> Option { match call_conv { - CallConv::Tail | CallConv::Fast | CallConv::Cold | CallConv::SystemV => match fltreg_idx { + CallConv::Tail => match fltreg_idx { + 0 => Some(regs::xmm0()), + 1 => Some(regs::xmm1()), + 2 => Some(regs::xmm2()), + 3 => Some(regs::xmm3()), + 4 => Some(regs::xmm4()), + 5 => Some(regs::xmm5()), + 6 => Some(regs::xmm6()), + 7 => Some(regs::xmm7()), + _ => None, + }, + CallConv::Fast | CallConv::Cold | CallConv::SystemV => match fltreg_idx { 0 => Some(regs::xmm0()), 1 => Some(regs::xmm1()), _ => None, @@ -951,6 +1004,7 @@ fn compute_clobber_size(clobbers: &[Writable]) -> u32 { const WINDOWS_CLOBBERS: PRegSet = windows_clobbers(); const SYSV_CLOBBERS: PRegSet = sysv_clobbers(); +const TAIL_CLOBBERS: PRegSet = tail_clobbers(); const fn windows_clobbers() -> PRegSet { PRegSet::empty() @@ -997,3 +1051,37 @@ const fn sysv_clobbers() -> PRegSet { .with(regs::fpr_preg(14)) .with(regs::fpr_preg(15)) } + +const fn tail_clobbers() -> PRegSet { + PRegSet::empty() + .with(regs::gpr_preg(regs::ENC_RAX)) + .with(regs::gpr_preg(regs::ENC_RCX)) + .with(regs::gpr_preg(regs::ENC_RDX)) 
+ .with(regs::gpr_preg(regs::ENC_RBX)) + .with(regs::gpr_preg(regs::ENC_RSI)) + .with(regs::gpr_preg(regs::ENC_RDI)) + .with(regs::gpr_preg(regs::ENC_R8)) + .with(regs::gpr_preg(regs::ENC_R9)) + .with(regs::gpr_preg(regs::ENC_R10)) + .with(regs::gpr_preg(regs::ENC_R11)) + .with(regs::gpr_preg(regs::ENC_R12)) + .with(regs::gpr_preg(regs::ENC_R13)) + .with(regs::gpr_preg(regs::ENC_R14)) + .with(regs::gpr_preg(regs::ENC_R15)) + .with(regs::fpr_preg(0)) + .with(regs::fpr_preg(1)) + .with(regs::fpr_preg(2)) + .with(regs::fpr_preg(3)) + .with(regs::fpr_preg(4)) + .with(regs::fpr_preg(5)) + .with(regs::fpr_preg(6)) + .with(regs::fpr_preg(7)) + .with(regs::fpr_preg(8)) + .with(regs::fpr_preg(9)) + .with(regs::fpr_preg(10)) + .with(regs::fpr_preg(11)) + .with(regs::fpr_preg(12)) + .with(regs::fpr_preg(13)) + .with(regs::fpr_preg(14)) + .with(regs::fpr_preg(15)) +} diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 1889f144a976..70e998ef9622 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -388,7 +388,9 @@ pub trait ABIMachineSpec { /// and stack slots. /// /// The argument locations should be pushed onto the given `ArgsAccumulator` - /// in order. + /// in order. Any extra arguments added (such as return area pointers) + /// should come at the end of the list so that the first N lowered + /// parameters align with the N clif parameters. 
/// /// Returns the stack-space used (rounded up to as alignment requires), and /// if `add_ret_area_ptr` was passed, the index of the extra synthetic arg diff --git a/cranelift/codegen/src/machinst/reg.rs b/cranelift/codegen/src/machinst/reg.rs index 61b804f1f45f..2fa9235de81a 100644 --- a/cranelift/codegen/src/machinst/reg.rs +++ b/cranelift/codegen/src/machinst/reg.rs @@ -419,7 +419,10 @@ impl<'a, F: Fn(VReg) -> VReg> OperandCollector<'a, F> { pub fn reg_fixed_def(&mut self, reg: Writable, rreg: Reg) { debug_assert!(reg.to_reg().is_virtual()); let rreg = rreg.to_real_reg().expect("fixed reg is not a RealReg"); - debug_assert!(self.is_allocatable_preg(rreg.into())); + debug_assert!( + self.is_allocatable_preg(rreg.into()), + "{rreg:?} is not allocatable" + ); self.add_operand(Operand::reg_fixed_def(reg.to_reg().into(), rreg.into())); } diff --git a/cranelift/filetests/filetests/isa/aarch64/tail-call-conv.clif b/cranelift/filetests/filetests/isa/aarch64/tail-call-conv.clif new file mode 100644 index 000000000000..7abba3125a66 --- /dev/null +++ b/cranelift/filetests/filetests/isa/aarch64/tail-call-conv.clif @@ -0,0 +1,463 @@ +test compile precise-output +target aarch64 + +;; Test the `tail` calling convention with non-tail calls and stack arguments. + +function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; ldr x9, [fp, #16] +; ldr x2, [fp, #24] +; ldp fp, lr, [sp], #16 +; add sp, sp, #16 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; block1: ; offset 0x8 +; ldur x9, [x29, #0x10] +; ldur x2, [x29, #0x18] +; ldp x29, x30, [sp], #0x10 +; add sp, sp, #0x10 +; ret + +function %tail_caller_stack_args() -> i64 tail { + fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + v26 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + return v26 +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; block0: +; movz x2, #10 +; movz x3, #15 +; movz x4, #20 +; movz x5, #25 +; movz x6, #30 +; movz x7, #35 +; movz x8, #40 +; movz x9, #45 +; movz x10, #50 +; movz x11, #55 +; movz x12, #60 +; movz x13, #65 +; movz x14, #70 +; movz x15, #75 +; movz x19, #80 +; movz x20, #85 +; movz x21, #90 +; movz x22, #95 +; movz x23, #100 +; movz x24, #105 +; movz x25, #110 +; movz x26, #115 +; movz x27, #120 +; movz x28, #125 +; movz x0, #130 +; movz x1, #135 +; sub sp, sp, #16 +; virtual_sp_offset_adjust 16 +; str x0, [sp] +; str x1, [sp, #8] +; load_ext_name x0, TestCase(%tail_callee_stack_args)+0 +; blr x0 +; ldp fp, lr, [sp], #16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; block1: ; offset 0x8 +; mov x2, #0xa +; mov x3, #0xf +; mov x4, #0x14 +; mov x5, #0x19 +; mov x6, #0x1e +; mov x7, #0x23 +; mov x8, #0x28 +; mov x9, #0x2d +; mov x10, #0x32 +; mov x11, #0x37 +; mov x12, #0x3c +; mov x13, #0x41 +; mov x14, #0x46 +; mov x15, #0x4b +; mov x19, #0x50 +; mov x20, #0x55 +; mov x21, #0x5a +; mov x22, #0x5f +; mov x23, #0x64 +; mov x24, #0x69 +; mov x25, #0x6e +; mov x26, #0x73 +; mov x27, #0x78 +; mov x28, #0x7d +; mov x0, #0x82 +; mov x1, #0x87 +; sub sp, sp, #0x10 +; stur x0, [sp] +; stur x1, [sp, #8] +; ldr x0, #0x84 +; b #0x8c +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_args 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; blr x0 +; ldp x29, x30, [sp], #0x10 +; ret + +;; Test the `tail` calling convention with non-tail calls and stack returns. + +function %tail_callee_stack_rets() -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail { +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 +} + +; VCode: +; stp fp, lr, [sp, #-16]! 
+; mov fp, sp +; sub sp, sp, #16 +; block0: +; movz x2, #10 +; str x2, [sp] +; movz x3, #15 +; movz x4, #20 +; movz x5, #25 +; movz x6, #30 +; movz x7, #35 +; movz x8, #40 +; movz x9, #45 +; movz x10, #50 +; movz x11, #55 +; movz x12, #60 +; movz x13, #65 +; movz x14, #70 +; movz x15, #75 +; movz x19, #80 +; movz x20, #85 +; movz x21, #90 +; movz x22, #95 +; movz x23, #100 +; movz x24, #105 +; movz x25, #110 +; movz x26, #115 +; movz x27, #120 +; movz x28, #125 +; movz x1, #130 +; movz x2, #135 +; str x1, [x0] +; str x2, [x0, #8] +; ldr x2, [sp] +; add sp, sp, #16 +; ldp fp, lr, [sp], #16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; sub sp, sp, #0x10 +; block1: ; offset 0xc +; mov x2, #0xa +; stur x2, [sp] +; mov x3, #0xf +; mov x4, #0x14 +; mov x5, #0x19 +; mov x6, #0x1e +; mov x7, #0x23 +; mov x8, #0x28 +; mov x9, #0x2d +; mov x10, #0x32 +; mov x11, #0x37 +; mov x12, #0x3c +; mov x13, #0x41 +; mov x14, #0x46 +; mov x15, #0x4b +; mov x19, #0x50 +; mov x20, #0x55 +; mov x21, #0x5a +; mov x22, #0x5f +; mov x23, #0x64 +; mov x24, #0x69 +; mov x25, #0x6e +; mov x26, #0x73 +; mov x27, #0x78 +; mov x28, #0x7d +; mov x1, #0x82 +; mov x2, #0x87 +; stur x1, [x0] +; stur x2, [x0, #8] +; ldur x2, [sp] +; add sp, sp, #0x10 +; ldp x29, x30, [sp], #0x10 +; ret + +function %tail_caller_stack_rets() -> i64 tail { + fn0 = %tail_callee_stack_rets() -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail + +block0: + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 = call fn0() + return v25 +} + +; VCode: +; stp fp, lr, [sp, #-16]! 
+; mov fp, sp +; block0: +; sub sp, sp, #16 +; virtual_sp_offset_adjust 16 +; mov x0, sp +; load_ext_name x14, TestCase(%tail_callee_stack_rets)+0 +; blr x14 +; ldr x13, [sp] +; ldr x2, [sp, #8] +; add sp, sp, #16 +; virtual_sp_offset_adjust -16 +; ldp fp, lr, [sp], #16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; block1: ; offset 0x8 +; sub sp, sp, #0x10 +; mov x0, sp +; ldr x14, #0x18 +; b #0x20 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_rets 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; blr x14 +; ldur x13, [sp] +; ldur x2, [sp, #8] +; add sp, sp, #0x10 +; ldp x29, x30, [sp], #0x10 +; ret + +;; Test the `tail` calling convention with non-tail calls and both stack +;; arguments and stack returns. + +function %tail_callee_stack_args_and_rets(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 +} + +; VCode: +; stp fp, lr, [sp, #-16]! +; mov fp, sp +; sub sp, sp, #16 +; block0: +; str x9, [sp] +; ldr x9, [fp, #16] +; ldr x1, [fp, #24] +; str x9, [x0] +; str x1, [x0, #8] +; ldr x9, [sp] +; add sp, sp, #16 +; ldp fp, lr, [sp], #16 +; add sp, sp, #16 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! 
+; mov x29, sp +; sub sp, sp, #0x10 +; block1: ; offset 0xc +; stur x9, [sp] +; ldur x9, [x29, #0x10] +; ldur x1, [x29, #0x18] +; stur x9, [x0] +; stur x1, [x0, #8] +; ldur x9, [sp] +; add sp, sp, #0x10 +; ldp x29, x30, [sp], #0x10 +; add sp, sp, #0x10 +; ret + +function %tail_caller_stack_args_and_rets() -> i64 tail { + fn0 = %tail_callee_stack_args_and_rets(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + return v51 +} + +; VCode: +; stp fp, lr, [sp, #-16]! 
+; mov fp, sp +; block0: +; movz x2, #10 +; movz x3, #15 +; movz x4, #20 +; movz x5, #25 +; movz x6, #30 +; movz x7, #35 +; movz x8, #40 +; movz x9, #45 +; movz x10, #50 +; movz x11, #55 +; movz x12, #60 +; movz x13, #65 +; movz x14, #70 +; movz x15, #75 +; movz x19, #80 +; movz x20, #85 +; movz x21, #90 +; movz x22, #95 +; movz x23, #100 +; movz x24, #105 +; movz x25, #110 +; movz x26, #115 +; movz x27, #120 +; movz x28, #125 +; movz x0, #130 +; movz x1, #135 +; sub sp, sp, #32 +; virtual_sp_offset_adjust 32 +; str x0, [sp] +; str x1, [sp, #8] +; add x0, sp, #16 +; load_ext_name x1, TestCase(%tail_callee_stack_args_and_rets)+0 +; blr x1 +; ldr x9, [sp] +; ldr x2, [sp, #8] +; add sp, sp, #16 +; virtual_sp_offset_adjust -16 +; ldp fp, lr, [sp], #16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; stp x29, x30, [sp, #-0x10]! +; mov x29, sp +; block1: ; offset 0x8 +; mov x2, #0xa +; mov x3, #0xf +; mov x4, #0x14 +; mov x5, #0x19 +; mov x6, #0x1e +; mov x7, #0x23 +; mov x8, #0x28 +; mov x9, #0x2d +; mov x10, #0x32 +; mov x11, #0x37 +; mov x12, #0x3c +; mov x13, #0x41 +; mov x14, #0x46 +; mov x15, #0x4b +; mov x19, #0x50 +; mov x20, #0x55 +; mov x21, #0x5a +; mov x22, #0x5f +; mov x23, #0x64 +; mov x24, #0x69 +; mov x25, #0x6e +; mov x26, #0x73 +; mov x27, #0x78 +; mov x28, #0x7d +; mov x0, #0x82 +; mov x1, #0x87 +; sub sp, sp, #0x20 +; stur x0, [sp] +; stur x1, [sp, #8] +; add x0, sp, #0x10 +; ldr x1, #0x88 +; b #0x90 +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_args_and_rets 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; blr x1 +; ldur x9, [sp] +; ldur x2, [sp, #8] +; add sp, sp, #0x10 +; ldp x29, x30, [sp], #0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/tail-call-conv.clif b/cranelift/filetests/filetests/isa/riscv64/tail-call-conv.clif new file mode 100644 index 000000000000..6693b2bf0177 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/tail-call-conv.clif @@ -0,0 +1,606 @@ +test compile precise-output 
+target riscv64 + +;; Test the `tail` calling convention with non-tail calls and stack arguments. + +function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; ld a6,16(fp) +; ld t3,24(fp) +; ld t0,32(fp) +; ld t2,40(fp) +; ld s1,48(fp) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; add sp, sp, #48 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; ld a6, 0x10(s0) +; ld t3, 0x18(s0) +; ld t0, 0x20(s0) +; ld t2, 0x28(s0) +; ld s1, 0x30(s0) +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; addi sp, sp, 0x30 +; ret + +function %tail_caller_stack_args() -> i64 tail { + fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + v26 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, 
v17, v18, v19, v20, v21, v22, v23, v24, v25) + return v26 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; add sp,-16 +; block0: +; li s1,10 +; sd s1,8(nominal_sp) +; li a0,15 +; sd a0,0(nominal_sp) +; li a1,20 +; li a2,25 +; li a3,30 +; li a4,35 +; li a5,40 +; li a6,45 +; li a7,50 +; li s2,55 +; li s3,60 +; li s4,65 +; li s5,70 +; li s6,75 +; li s7,80 +; li s8,85 +; li s9,90 +; li s10,95 +; li s11,100 +; li t3,105 +; li t4,110 +; li t1,115 +; li t2,120 +; li t0,125 +; li s1,130 +; li a0,135 +; add sp,-48 +; virtual_sp_offset_adj +48 +; sd t1,0(sp) +; sd t2,8(sp) +; sd t0,16(sp) +; sd s1,24(sp) +; sd a0,32(sp) +; load_sym t0,%tail_callee_stack_args+0 +; ld a0,0(nominal_sp) +; ld s1,8(nominal_sp) +; callind t0 +; add sp,+16 +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; addi sp, sp, -0x10 +; block1: ; offset 0x14 +; addi s1, zero, 0xa +; sd s1, 8(sp) +; addi a0, zero, 0xf +; sd a0, 0(sp) +; addi a1, zero, 0x14 +; addi a2, zero, 0x19 +; addi a3, zero, 0x1e +; addi a4, zero, 0x23 +; addi a5, zero, 0x28 +; addi a6, zero, 0x2d +; addi a7, zero, 0x32 +; addi s2, zero, 0x37 +; addi s3, zero, 0x3c +; addi s4, zero, 0x41 +; addi s5, zero, 0x46 +; addi s6, zero, 0x4b +; addi s7, zero, 0x50 +; addi s8, zero, 0x55 +; addi s9, zero, 0x5a +; addi s10, zero, 0x5f +; addi s11, zero, 0x64 +; addi t3, zero, 0x69 +; addi t4, zero, 0x6e +; addi t1, zero, 0x73 +; addi t2, zero, 0x78 +; addi t0, zero, 0x7d +; addi s1, zero, 0x82 +; addi a0, zero, 0x87 +; addi sp, sp, -0x30 +; sd t1, 0(sp) +; sd t2, 8(sp) +; sd t0, 0x10(sp) +; sd s1, 0x18(sp) +; sd a0, 0x20(sp) +; auipc t0, 0 +; ld t0, 0xc(t0) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_args 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld a0, 0x30(sp) +; ld s1, 0x38(sp) +; jalr t0 +; addi sp, sp, 0x10 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + 
+;; Test the `tail` calling convention with non-tail calls and stack returns. + +function %tail_callee_stack_rets() -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail { +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; add sp,-32 +; block0: +; li t1,10 +; sd t1,16(nominal_sp) +; li a0,15 +; sd a0,8(nominal_sp) +; li a1,20 +; sd a1,0(nominal_sp) +; li a2,25 +; li a3,30 +; li a4,35 +; li a5,40 +; li a6,45 +; li a7,50 +; li s2,55 +; li s3,60 +; li s4,65 +; li s5,70 +; li s6,75 +; li s7,80 +; li s8,85 +; li s9,90 +; li s10,95 +; li s11,100 +; li t3,105 +; li t4,110 +; li t1,115 +; li t2,120 +; li t0,125 +; li a0,130 +; li a1,135 +; sd t1,0(s1) +; sd t2,8(s1) +; sd t0,16(s1) +; sd a0,24(s1) +; sd a1,32(s1) +; ld a1,0(nominal_sp) +; ld a0,8(nominal_sp) +; ld s1,16(nominal_sp) +; add sp,+32 +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; addi sp, sp, -0x20 +; block1: ; offset 0x14 +; addi t1, zero, 0xa +; sd t1, 0x10(sp) +; addi a0, zero, 0xf +; sd a0, 8(sp) +; addi a1, zero, 0x14 +; sd a1, 0(sp) +; addi a2, zero, 0x19 +; addi a3, zero, 0x1e +; addi 
a4, zero, 0x23 +; addi a5, zero, 0x28 +; addi a6, zero, 0x2d +; addi a7, zero, 0x32 +; addi s2, zero, 0x37 +; addi s3, zero, 0x3c +; addi s4, zero, 0x41 +; addi s5, zero, 0x46 +; addi s6, zero, 0x4b +; addi s7, zero, 0x50 +; addi s8, zero, 0x55 +; addi s9, zero, 0x5a +; addi s10, zero, 0x5f +; addi s11, zero, 0x64 +; addi t3, zero, 0x69 +; addi t4, zero, 0x6e +; addi t1, zero, 0x73 +; addi t2, zero, 0x78 +; addi t0, zero, 0x7d +; addi a0, zero, 0x82 +; addi a1, zero, 0x87 +; sd t1, 0(s1) +; sd t2, 8(s1) +; sd t0, 0x10(s1) +; sd a0, 0x18(s1) +; sd a1, 0x20(s1) +; ld a1, 0(sp) +; ld a0, 8(sp) +; ld s1, 0x10(sp) +; addi sp, sp, 0x20 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %tail_caller_stack_rets() -> i64 tail { + fn0 = %tail_callee_stack_rets() -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail + +block0: + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 = call fn0() + return v25 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; add sp,-48 +; virtual_sp_offset_adj +48 +; load_addr s1,0(sp) +; load_sym a1,%tail_callee_stack_rets+0 +; callind a1 +; ld t0,0(sp) +; ld t2,8(sp) +; ld a1,16(sp) +; ld a3,24(sp) +; ld s1,32(sp) +; add sp,+48 +; virtual_sp_offset_adj -48 +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; addi sp, sp, -0x30 +; mv s1, sp +; auipc a1, 0 +; ld a1, 0xc(a1) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_rets 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; jalr a1 +; ld t0, 0(sp) +; ld t2, 8(sp) +; ld a1, 0x10(sp) +; ld a3, 0x18(sp) +; ld s1, 0x20(sp) +; addi sp, sp, 0x30 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +;; Test the `tail` calling 
convention with non-tail calls and both stack +;; arguments and stack returns. + +function %tail_callee_stack_args_and_rets(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; add sp,-32 +; block0: +; sd a3,0(nominal_sp) +; sd a6,8(nominal_sp) +; sd t3,16(nominal_sp) +; ld a6,16(fp) +; ld t3,24(fp) +; ld t1,32(fp) +; ld t2,40(fp) +; ld t0,48(fp) +; ld a3,56(fp) +; sd a6,0(a3) +; sd t3,8(a3) +; sd t1,16(a3) +; sd t2,24(a3) +; sd t0,32(a3) +; ld a3,0(nominal_sp) +; ld a6,8(nominal_sp) +; ld t3,16(nominal_sp) +; add sp,+32 +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; add sp, sp, #48 ; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; addi sp, sp, -0x20 +; block1: ; offset 0x14 +; sd a3, 0(sp) +; sd a6, 8(sp) +; sd t3, 0x10(sp) +; ld a6, 0x10(s0) +; ld t3, 0x18(s0) +; ld t1, 0x20(s0) +; ld t2, 0x28(s0) +; ld t0, 0x30(s0) +; ld a3, 0x38(s0) +; sd a6, 0(a3) +; sd t3, 8(a3) +; sd t1, 0x10(a3) +; sd t2, 0x18(a3) +; sd t0, 0x20(a3) +; ld a3, 0(sp) +; ld a6, 8(sp) +; ld t3, 0x10(sp) +; addi sp, sp, 0x20 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; addi sp, sp, 0x30 +; ret + +function %tail_caller_stack_args_and_rets() -> i64 tail { + fn0 = %tail_callee_stack_args_and_rets(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, 
i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + return v51 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; add sp,-16 +; block0: +; li s1,10 +; sd s1,8(nominal_sp) +; li a0,15 +; sd a0,0(nominal_sp) +; li a1,20 +; li a2,25 +; li a3,30 +; li a4,35 +; li a5,40 +; li a6,45 +; li a7,50 +; li s2,55 +; li s3,60 +; li s4,65 +; li s5,70 +; li s6,75 +; li s7,80 +; li s8,85 +; li s9,90 +; li s10,95 +; li s11,100 +; li t3,105 +; li t4,110 +; li t1,115 +; li t2,120 +; li t0,125 +; li s1,130 +; li a0,135 +; add sp,-96 +; virtual_sp_offset_adj +96 +; sd t1,0(sp) +; sd t2,8(sp) +; sd t0,16(sp) +; sd s1,24(sp) +; sd a0,32(sp) +; load_addr t0,48(sp) +; sd t0,40(sp) +; load_sym t0,%tail_callee_stack_args_and_rets+0 +; ld a0,0(nominal_sp) +; ld s1,8(nominal_sp) +; callind t0 +; ld a3,0(sp) +; ld a5,8(sp) +; ld a7,16(sp) +; ld t4,24(sp) +; ld s1,32(sp) +; add sp,+48 +; virtual_sp_offset_adj -48 +; add sp,+16 +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; 
Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; addi sp, sp, -0x10 +; block1: ; offset 0x14 +; addi s1, zero, 0xa +; sd s1, 8(sp) +; addi a0, zero, 0xf +; sd a0, 0(sp) +; addi a1, zero, 0x14 +; addi a2, zero, 0x19 +; addi a3, zero, 0x1e +; addi a4, zero, 0x23 +; addi a5, zero, 0x28 +; addi a6, zero, 0x2d +; addi a7, zero, 0x32 +; addi s2, zero, 0x37 +; addi s3, zero, 0x3c +; addi s4, zero, 0x41 +; addi s5, zero, 0x46 +; addi s6, zero, 0x4b +; addi s7, zero, 0x50 +; addi s8, zero, 0x55 +; addi s9, zero, 0x5a +; addi s10, zero, 0x5f +; addi s11, zero, 0x64 +; addi t3, zero, 0x69 +; addi t4, zero, 0x6e +; addi t1, zero, 0x73 +; addi t2, zero, 0x78 +; addi t0, zero, 0x7d +; addi s1, zero, 0x82 +; addi a0, zero, 0x87 +; addi sp, sp, -0x60 +; sd t1, 0(sp) +; sd t2, 8(sp) +; sd t0, 0x10(sp) +; sd s1, 0x18(sp) +; sd a0, 0x20(sp) +; addi t0, sp, 0x30 +; sd t0, 0x28(sp) +; auipc t0, 0 +; ld t0, 0xc(t0) +; j 0xc +; .byte 0x00, 0x00, 0x00, 0x00 ; reloc_external Abs8 %tail_callee_stack_args_and_rets 0 +; .byte 0x00, 0x00, 0x00, 0x00 +; ld a0, 0x60(sp) +; ld s1, 0x68(sp) +; jalr t0 +; ld a3, 0(sp) +; ld a5, 8(sp) +; ld a7, 0x10(sp) +; ld t4, 0x18(sp) +; ld s1, 0x20(sp) +; addi sp, sp, 0x30 +; addi sp, sp, 0x10 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/x64/tail-call-conv.clif b/cranelift/filetests/filetests/isa/x64/tail-call-conv.clif new file mode 100644 index 000000000000..03c04601f265 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/tail-call-conv.clif @@ -0,0 +1,741 @@ +test compile precise-output +target x86_64 + +;; Test the `tail` calling convention with non-tail calls and stack arguments. 
+ +function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64): + return v14 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; movq 16(%rbp), %r8 +; movq 24(%rbp), %rax +; movq %rbp, %rsp +; popq %rbp +; ret 16 +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; movq 0x10(%rbp), %r8 +; movq 0x18(%rbp), %rax +; movq %rbp, %rsp +; popq %rbp +; retq $0x10 + +function %tail_caller_stack_args() -> i64 { + fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) + return v15 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $64, %rsp +; movq %rbx, 16(%rsp) +; movq %r12, 24(%rsp) +; movq %r13, 32(%rsp) +; movq %r14, 40(%rsp) +; movq %r15, 48(%rsp) +; block0: +; movl $10, %eax +; movq %rax, rsp(0 + virtual offset) +; movl $15, %ecx +; movl $20, %edx +; movl $25, %ebx +; movl $30, %esi +; movl $35, %edi +; movl $40, %r8d +; movl $45, %r9d +; movl $50, %r10d +; movl $55, %r11d +; movl $60, %r12d +; movl $65, %r13d +; movl $70, %r14d +; movl $75, %r15d +; movl $80, %eax +; subq %rsp, $16, %rsp +; virtual_sp_offset_adjust 16 +; movq %r15, 0(%rsp) +; movq %rax, 8(%rsp) +; load_ext_name %tail_callee_stack_args+0, %r15 +; movq rsp(0 + virtual offset), %rax +; call *%r15 +; movq 16(%rsp), %rbx +; movq 24(%rsp), %r12 +; movq 32(%rsp), 
%r13 +; movq 40(%rsp), %r14 +; movq 48(%rsp), %r15 +; addq %rsp, $64, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x40, %rsp +; movq %rbx, 0x10(%rsp) +; movq %r12, 0x18(%rsp) +; movq %r13, 0x20(%rsp) +; movq %r14, 0x28(%rsp) +; movq %r15, 0x30(%rsp) +; block1: ; offset 0x21 +; movl $0xa, %eax +; movq %rax, (%rsp) +; movl $0xf, %ecx +; movl $0x14, %edx +; movl $0x19, %ebx +; movl $0x1e, %esi +; movl $0x23, %edi +; movl $0x28, %r8d +; movl $0x2d, %r9d +; movl $0x32, %r10d +; movl $0x37, %r11d +; movl $0x3c, %r12d +; movl $0x41, %r13d +; movl $0x46, %r14d +; movl $0x4b, %r15d +; movl $0x50, %eax +; subq $0x10, %rsp +; movq %r15, (%rsp) +; movq %rax, 8(%rsp) +; movabsq $0, %r15 ; reloc_external Abs8 %tail_callee_stack_args 0 +; movq 0x10(%rsp), %rax +; callq *%r15 +; movq 0x10(%rsp), %rbx +; movq 0x18(%rsp), %r12 +; movq 0x20(%rsp), %r13 +; movq 0x28(%rsp), %r14 +; movq 0x30(%rsp), %r15 +; addq $0x40, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +;; Test the `tail` calling convention with non-tail calls and stack returns. 
+ +function %tail_callee_stack_rets() -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail { +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $112, %rsp +; block0: +; movl $10, %esi +; movq %rsi, rsp(96 + virtual offset) +; movl $15, %ecx +; movq %rcx, rsp(88 + virtual offset) +; movl $20, %edx +; movq %rdx, rsp(80 + virtual offset) +; movl $25, %ebx +; movq %rbx, rsp(72 + virtual offset) +; movl $30, %esi +; movq %rsi, rsp(64 + virtual offset) +; movl $35, %edi +; movq %rdi, rsp(56 + virtual offset) +; movl $40, %r8d +; movq %r8, rsp(48 + virtual offset) +; movl $45, %r9d +; movq %r9, rsp(40 + virtual offset) +; movl $50, %r10d +; movq %r10, rsp(32 + virtual offset) +; movl $55, %r11d +; movq %r11, rsp(24 + virtual offset) +; movl $60, %r12d +; movq %r12, rsp(16 + virtual offset) +; movl $65, %r13d +; movq %r13, rsp(8 + virtual offset) +; movl $70, %r14d +; movq %r14, rsp(0 + virtual offset) +; movl $75, %r15d +; movl $80, %ecx +; movl $85, %edx +; movl $90, %ebx +; movl $95, %esi +; movl $100, %edi +; movl $105, %r8d +; movl $110, %r9d +; movl $115, %r10d +; movl $120, %r11d +; movl $125, %r12d +; movl $130, %r13d +; movl $135, %r14d +; movq %r15, 0(%rax) +; movq %rcx, 
8(%rax) +; movq %rdx, 16(%rax) +; movq %rbx, 24(%rax) +; movq %rsi, 32(%rax) +; movq %rdi, 40(%rax) +; movq %r8, 48(%rax) +; movq %r9, 56(%rax) +; movq %r10, 64(%rax) +; movq %r11, 72(%rax) +; movq %r12, 80(%rax) +; movq %r13, 88(%rax) +; movq %r14, 96(%rax) +; movq rsp(0 + virtual offset), %r14 +; movq rsp(8 + virtual offset), %r13 +; movq rsp(16 + virtual offset), %r12 +; movq rsp(24 + virtual offset), %r11 +; movq rsp(32 + virtual offset), %r10 +; movq rsp(40 + virtual offset), %r9 +; movq rsp(48 + virtual offset), %r8 +; movq rsp(56 + virtual offset), %rdi +; movq rsp(64 + virtual offset), %rsi +; movq rsp(72 + virtual offset), %rbx +; movq rsp(80 + virtual offset), %rdx +; movq rsp(88 + virtual offset), %rcx +; movq rsp(96 + virtual offset), %rax +; addq %rsp, $112, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x70, %rsp +; block1: ; offset 0x8 +; movl $0xa, %esi +; movq %rsi, 0x60(%rsp) +; movl $0xf, %ecx +; movq %rcx, 0x58(%rsp) +; movl $0x14, %edx +; movq %rdx, 0x50(%rsp) +; movl $0x19, %ebx +; movq %rbx, 0x48(%rsp) +; movl $0x1e, %esi +; movq %rsi, 0x40(%rsp) +; movl $0x23, %edi +; movq %rdi, 0x38(%rsp) +; movl $0x28, %r8d +; movq %r8, 0x30(%rsp) +; movl $0x2d, %r9d +; movq %r9, 0x28(%rsp) +; movl $0x32, %r10d +; movq %r10, 0x20(%rsp) +; movl $0x37, %r11d +; movq %r11, 0x18(%rsp) +; movl $0x3c, %r12d +; movq %r12, 0x10(%rsp) +; movl $0x41, %r13d +; movq %r13, 8(%rsp) +; movl $0x46, %r14d +; movq %r14, (%rsp) +; movl $0x4b, %r15d +; movl $0x50, %ecx +; movl $0x55, %edx +; movl $0x5a, %ebx +; movl $0x5f, %esi +; movl $0x64, %edi +; movl $0x69, %r8d +; movl $0x6e, %r9d +; movl $0x73, %r10d +; movl $0x78, %r11d +; movl $0x7d, %r12d +; movl $0x82, %r13d +; movl $0x87, %r14d +; movq %r15, (%rax) +; movq %rcx, 8(%rax) +; movq %rdx, 0x10(%rax) +; movq %rbx, 0x18(%rax) +; movq %rsi, 0x20(%rax) +; movq %rdi, 0x28(%rax) +; movq %r8, 0x30(%rax) +; movq %r9, 0x38(%rax) +; movq %r10, 
0x40(%rax) +; movq %r11, 0x48(%rax) +; movq %r12, 0x50(%rax) +; movq %r13, 0x58(%rax) +; movq %r14, 0x60(%rax) +; movq (%rsp), %r14 +; movq 8(%rsp), %r13 +; movq 0x10(%rsp), %r12 +; movq 0x18(%rsp), %r11 +; movq 0x20(%rsp), %r10 +; movq 0x28(%rsp), %r9 +; movq 0x30(%rsp), %r8 +; movq 0x38(%rsp), %rdi +; movq 0x40(%rsp), %rsi +; movq 0x48(%rsp), %rbx +; movq 0x50(%rsp), %rdx +; movq 0x58(%rsp), %rcx +; movq 0x60(%rsp), %rax +; addq $0x70, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %tail_caller_stack_rets() -> i64 tail { + fn0 = colocated %tail_callee_stack_rets() -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail + +block0: + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 = call fn0() + return v25 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; subq %rsp, $112, %rsp +; virtual_sp_offset_adjust 112 +; lea 0(%rsp), %rax +; call TestCase(%tail_callee_stack_rets) +; movq 0(%rsp), %r11 +; movq 8(%rsp), %rdi +; movq 16(%rsp), %rcx +; movq 24(%rsp), %r8 +; movq 32(%rsp), %r10 +; movq 40(%rsp), %rsi +; movq 48(%rsp), %rax +; movq 56(%rsp), %rdx +; movq 64(%rsp), %r9 +; movq 72(%rsp), %r11 +; movq 80(%rsp), %rdi +; movq 88(%rsp), %rcx +; movq 96(%rsp), %rax +; addq %rsp, $112, %rsp +; virtual_sp_offset_adjust -112 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; subq $0x70, %rsp +; leaq (%rsp), %rax +; callq 0x11 ; reloc_external CallPCRel4 %tail_callee_stack_rets -4 +; movq (%rsp), %r11 +; movq 8(%rsp), %rdi +; movq 0x10(%rsp), %rcx +; movq 0x18(%rsp), %r8 +; movq 0x20(%rsp), %r10 +; movq 0x28(%rsp), %rsi +; movq 0x30(%rsp), %rax +; movq 0x38(%rsp), %rdx +; movq 0x40(%rsp), %r9 +; movq 0x48(%rsp), %r11 +; movq 0x50(%rsp), %rdi +; movq 0x58(%rsp), %rcx +; movq 0x60(%rsp), %rax +; addq $0x70, %rsp +; 
movq %rbp, %rsp +; popq %rbp +; retq + +;; Test the `tail` calling convention with non-tail calls and both stack +;; arguments and stack returns. + +function %tail_callee_stack_args_and_rets(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $112, %rsp +; block0: +; movq %rax, rsp(0 + virtual offset) +; movq %rcx, rsp(8 + virtual offset) +; movq %rdx, rsp(16 + virtual offset) +; movq %rbx, rsp(24 + virtual offset) +; movq %rsi, rsp(32 + virtual offset) +; movq %rdi, rsp(40 + virtual offset) +; movq %r8, rsp(48 + virtual offset) +; movq %r9, rsp(56 + virtual offset) +; movq %r10, rsp(64 + virtual offset) +; movq %r11, rsp(72 + virtual offset) +; movq %r12, rsp(80 + virtual offset) +; movq %r13, rsp(88 + virtual offset) +; movq %r14, rsp(96 + virtual offset) +; movq 16(%rbp), %rbx +; movq 24(%rbp), %r12 +; movq 32(%rbp), %r14 +; movq 40(%rbp), %rax +; movq 48(%rbp), %rdx +; movq 56(%rbp), %r9 +; movq 64(%rbp), %r11 +; movq 72(%rbp), %rdi +; movq 80(%rbp), %rcx +; movq 88(%rbp), %r8 +; movq 96(%rbp), %r10 +; movq 104(%rbp), %rsi +; movq 112(%rbp), %r15 +; movq 120(%rbp), %r13 +; movq %rbx, 0(%r13) +; movq %r12, 8(%r13) +; movq %r14, 16(%r13) +; movq %rax, 24(%r13) +; movq %rdx, 32(%r13) +; movq %r9, 40(%r13) +; movq %r11, 48(%r13) +; movq %rdi, 56(%r13) +; movq %rcx, 64(%r13) +; movq %r8, 72(%r13) +; movq %r10, 
80(%r13) +; movq %rsi, 88(%r13) +; movq %r15, 96(%r13) +; movq rsp(0 + virtual offset), %rax +; movq rsp(8 + virtual offset), %rcx +; movq rsp(16 + virtual offset), %rdx +; movq rsp(24 + virtual offset), %rbx +; movq rsp(32 + virtual offset), %rsi +; movq rsp(40 + virtual offset), %rdi +; movq rsp(48 + virtual offset), %r8 +; movq rsp(56 + virtual offset), %r9 +; movq rsp(64 + virtual offset), %r10 +; movq rsp(72 + virtual offset), %r11 +; movq rsp(80 + virtual offset), %r12 +; movq rsp(88 + virtual offset), %r13 +; movq rsp(96 + virtual offset), %r14 +; addq %rsp, $112, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret 112 +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x70, %rsp +; block1: ; offset 0x8 +; movq %rax, (%rsp) +; movq %rcx, 8(%rsp) +; movq %rdx, 0x10(%rsp) +; movq %rbx, 0x18(%rsp) +; movq %rsi, 0x20(%rsp) +; movq %rdi, 0x28(%rsp) +; movq %r8, 0x30(%rsp) +; movq %r9, 0x38(%rsp) +; movq %r10, 0x40(%rsp) +; movq %r11, 0x48(%rsp) +; movq %r12, 0x50(%rsp) +; movq %r13, 0x58(%rsp) +; movq %r14, 0x60(%rsp) +; movq 0x10(%rbp), %rbx +; movq 0x18(%rbp), %r12 +; movq 0x20(%rbp), %r14 +; movq 0x28(%rbp), %rax +; movq 0x30(%rbp), %rdx +; movq 0x38(%rbp), %r9 +; movq 0x40(%rbp), %r11 +; movq 0x48(%rbp), %rdi +; movq 0x50(%rbp), %rcx +; movq 0x58(%rbp), %r8 +; movq 0x60(%rbp), %r10 +; movq 0x68(%rbp), %rsi +; movq 0x70(%rbp), %r15 +; movq 0x78(%rbp), %r13 +; movq %rbx, (%r13) +; movq %r12, 8(%r13) +; movq %r14, 0x10(%r13) +; movq %rax, 0x18(%r13) +; movq %rdx, 0x20(%r13) +; movq %r9, 0x28(%r13) +; movq %r11, 0x30(%r13) +; movq %rdi, 0x38(%r13) +; movq %rcx, 0x40(%r13) +; movq %r8, 0x48(%r13) +; movq %r10, 0x50(%r13) +; movq %rsi, 0x58(%r13) +; movq %r15, 0x60(%r13) +; movq (%rsp), %rax +; movq 8(%rsp), %rcx +; movq 0x10(%rsp), %rdx +; movq 0x18(%rsp), %rbx +; movq 0x20(%rsp), %rsi +; movq 0x28(%rsp), %rdi +; movq 0x30(%rsp), %r8 +; movq 0x38(%rsp), %r9 +; movq 0x40(%rsp), %r10 +; movq 0x48(%rsp), %r11 +; movq 0x50(%rsp), %r12 +; 
movq 0x58(%rsp), %r13 +; movq 0x60(%rsp), %r14 +; addq $0x70, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq $0x70 + +function %tail_caller_stack_args_and_rets() -> i64 tail { + fn0 = %tail_callee_stack_args_and_rets(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64 tail + +block0: + v0 = iconst.i64 10 + v1 = iconst.i64 15 + v2 = iconst.i64 20 + v3 = iconst.i64 25 + v4 = iconst.i64 30 + v5 = iconst.i64 35 + v6 = iconst.i64 40 + v7 = iconst.i64 45 + v8 = iconst.i64 50 + v9 = iconst.i64 55 + v10 = iconst.i64 60 + v11 = iconst.i64 65 + v12 = iconst.i64 70 + v13 = iconst.i64 75 + v14 = iconst.i64 80 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + return v51 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $96, %rsp +; block0: +; movl $10, %eax +; movq %rax, rsp(88 + virtual offset) +; movl $15, %ecx +; movq %rcx, rsp(80 + virtual offset) +; movl $20, %edx +; movq %rdx, rsp(72 + virtual offset) +; movl $25, %ebx +; movq %rbx, rsp(64 + virtual offset) +; movl $30, %esi +; movq %rsi, rsp(56 + virtual offset) +; movl $35, %edi +; movq %rdi, rsp(48 + virtual offset) +; movl $40, %r8d +; movq %r8, rsp(40 + virtual offset) +; movl $45, %r9d +; movq %r9, rsp(32 + virtual offset) +; movl $50, %r10d +; movq %r10, rsp(24 + virtual offset) +; movl $55, %r11d +; movq %r11, rsp(16 + virtual offset) +; movl 
$60, %r12d +; movq %r12, rsp(8 + virtual offset) +; movl $65, %r13d +; movq %r13, rsp(0 + virtual offset) +; movl $70, %r14d +; movl $75, %r15d +; movl $80, %eax +; movl $85, %ecx +; movl $90, %edx +; movl $95, %ebx +; movl $100, %esi +; movl $105, %edi +; movl $110, %r8d +; movl $115, %r9d +; movl $120, %r10d +; movl $125, %r11d +; movl $130, %r12d +; movl $135, %r13d +; subq %rsp, $224, %rsp +; virtual_sp_offset_adjust 224 +; movq %r15, 0(%rsp) +; movq %rax, 8(%rsp) +; movq %rcx, 16(%rsp) +; movq %rdx, 24(%rsp) +; movq %rbx, 32(%rsp) +; movq %rsi, 40(%rsp) +; movq %rdi, 48(%rsp) +; movq %r8, 56(%rsp) +; movq %r9, 64(%rsp) +; movq %r10, 72(%rsp) +; movq %r11, 80(%rsp) +; movq %r12, 88(%rsp) +; movq %r13, 96(%rsp) +; lea 112(%rsp), %rdx +; movq %rdx, 104(%rsp) +; load_ext_name %tail_callee_stack_args_and_rets+0, %r15 +; movq rsp(0 + virtual offset), %r13 +; movq rsp(8 + virtual offset), %r12 +; movq rsp(16 + virtual offset), %r11 +; movq rsp(24 + virtual offset), %r10 +; movq rsp(32 + virtual offset), %r9 +; movq rsp(40 + virtual offset), %r8 +; movq rsp(48 + virtual offset), %rdi +; movq rsp(56 + virtual offset), %rsi +; movq rsp(64 + virtual offset), %rbx +; movq rsp(72 + virtual offset), %rdx +; movq rsp(80 + virtual offset), %rcx +; movq rsp(88 + virtual offset), %rax +; call *%r15 +; movq 0(%rsp), %rdx +; movq 8(%rsp), %r9 +; movq 16(%rsp), %r11 +; movq 24(%rsp), %rdi +; movq 32(%rsp), %rcx +; movq 40(%rsp), %r8 +; movq 48(%rsp), %r10 +; movq 56(%rsp), %rsi +; movq 64(%rsp), %rax +; movq 72(%rsp), %rdx +; movq 80(%rsp), %r9 +; movq 88(%rsp), %r11 +; movq 96(%rsp), %rax +; addq %rsp, $112, %rsp +; virtual_sp_offset_adjust -112 +; addq %rsp, $96, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x60, %rsp +; block1: ; offset 0x8 +; movl $0xa, %eax +; movq %rax, 0x58(%rsp) +; movl $0xf, %ecx +; movq %rcx, 0x50(%rsp) +; movl $0x14, %edx +; movq %rdx, 0x48(%rsp) +; movl $0x19, %ebx +; 
movq %rbx, 0x40(%rsp) +; movl $0x1e, %esi +; movq %rsi, 0x38(%rsp) +; movl $0x23, %edi +; movq %rdi, 0x30(%rsp) +; movl $0x28, %r8d +; movq %r8, 0x28(%rsp) +; movl $0x2d, %r9d +; movq %r9, 0x20(%rsp) +; movl $0x32, %r10d +; movq %r10, 0x18(%rsp) +; movl $0x37, %r11d +; movq %r11, 0x10(%rsp) +; movl $0x3c, %r12d +; movq %r12, 8(%rsp) +; movl $0x41, %r13d +; movq %r13, (%rsp) +; movl $0x46, %r14d +; movl $0x4b, %r15d +; movl $0x50, %eax +; movl $0x55, %ecx +; movl $0x5a, %edx +; movl $0x5f, %ebx +; movl $0x64, %esi +; movl $0x69, %edi +; movl $0x6e, %r8d +; movl $0x73, %r9d +; movl $0x78, %r10d +; movl $0x7d, %r11d +; movl $0x82, %r12d +; movl $0x87, %r13d +; subq $0xe0, %rsp +; movq %r15, (%rsp) +; movq %rax, 8(%rsp) +; movq %rcx, 0x10(%rsp) +; movq %rdx, 0x18(%rsp) +; movq %rbx, 0x20(%rsp) +; movq %rsi, 0x28(%rsp) +; movq %rdi, 0x30(%rsp) +; movq %r8, 0x38(%rsp) +; movq %r9, 0x40(%rsp) +; movq %r10, 0x48(%rsp) +; movq %r11, 0x50(%rsp) +; movq %r12, 0x58(%rsp) +; movq %r13, 0x60(%rsp) +; leaq 0x70(%rsp), %rdx +; movq %rdx, 0x68(%rsp) +; movabsq $0, %r15 ; reloc_external Abs8 %tail_callee_stack_args_and_rets 0 +; movq 0xe0(%rsp), %r13 +; movq 0xe8(%rsp), %r12 +; movq 0xf0(%rsp), %r11 +; movq 0xf8(%rsp), %r10 +; movq 0x100(%rsp), %r9 +; movq 0x108(%rsp), %r8 +; movq 0x110(%rsp), %rdi +; movq 0x118(%rsp), %rsi +; movq 0x120(%rsp), %rbx +; movq 0x128(%rsp), %rdx +; movq 0x130(%rsp), %rcx +; movq 0x138(%rsp), %rax +; callq *%r15 +; movq (%rsp), %rdx +; movq 8(%rsp), %r9 +; movq 0x10(%rsp), %r11 +; movq 0x18(%rsp), %rdi +; movq 0x20(%rsp), %rcx +; movq 0x28(%rsp), %r8 +; movq 0x30(%rsp), %r10 +; movq 0x38(%rsp), %rsi +; movq 0x40(%rsp), %rax +; movq 0x48(%rsp), %rdx +; movq 0x50(%rsp), %r9 +; movq 0x58(%rsp), %r11 +; movq 0x60(%rsp), %rax +; addq $0x70, %rsp +; addq $0x60, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/runtests/tail-call-conv.clif b/cranelift/filetests/filetests/runtests/tail-call-conv.clif index 
d0c7a1a4d5ef..efa9431d6e6b 100644 --- a/cranelift/filetests/filetests/runtests/tail-call-conv.clif +++ b/cranelift/filetests/filetests/runtests/tail-call-conv.clif @@ -23,7 +23,7 @@ block0(v0: i8, v1: i16, v2: i32, v3: i64, v4: f32, v5: f64): return v11, v10, v9, v8, v7, v6 } -function %tail_caller() -> f64, f32, i64, i32, i16, i8 { +function %tail_caller() -> f64, f32, i64, i32, i16, i8 tail { fn0 = %tail_callee(i8, i16, i32, i64, f32, f64) -> f64, f32, i64, i32, i16, i8 tail block0: @@ -41,13 +41,13 @@ block0: ;; Test the `tail` calling convention with non-tail calls and stack arguments. -function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { -block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64): - return v14 +function %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail { +block0(v0: i64, v1: i64, v2: i64, v3: i64, v4: i64, v5: i64, v6: i64, v7: i64, v8: i64, v9: i64, v10: i64, v11: i64, v12: i64, v13: i64, v14: i64, v15: i64, v16: i64, v17: i64, v18: i64, v19: i64, v20: i64, v21: i64, v22: i64, v23: i64, v24: i64, v25: i64): + return v25 } -function %tail_caller_stack_args() -> i64 { - fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail +function %tail_caller_stack_args() -> i64 tail { + fn0 = %tail_callee_stack_args(i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64) -> i64 tail block0: v0 = iconst.i64 10 @@ -65,11 +65,22 @@ block0: v12 = iconst.i64 70 v13 = iconst.i64 75 v14 = iconst.i64 80 - v15 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14) - return v15 + v15 = iconst.i64 85 + v16 = iconst.i64 90 + v17 = iconst.i64 95 + 
v18 = iconst.i64 100 + v19 = iconst.i64 105 + v20 = iconst.i64 110 + v21 = iconst.i64 115 + v22 = iconst.i64 120 + v23 = iconst.i64 125 + v24 = iconst.i64 130 + v25 = iconst.i64 135 + v26 = call fn0(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25) + return v26 } -; run: %tail_caller_stack_args() == 80 +; run: %tail_caller_stack_args() == 135 ;; Test the `tail` calling convention with non-tail calls and stack returns.