From 1c05e06bd5cacf871a42b8b8bc374c0187b19fcb Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Mon, 14 Jun 2021 08:48:34 +0100 Subject: [PATCH 1/2] aarch64: Implement I128 Loads and Stores --- cranelift/codegen/src/isa/aarch64/lower.rs | 140 +++++++++++---- .../codegen/src/isa/aarch64/lower_inst.rs | 150 +++++++++------- .../filetests/isa/aarch64/stack.clif | 164 ++++++++++++++++++ .../filetests/runtests/i128-load-store.clif | 91 ++++++++++ 4 files changed, 451 insertions(+), 94 deletions(-) create mode 100644 cranelift/filetests/filetests/runtests/i128-load-store.clif diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index bdece7311dff..e7853ca98e6c 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -692,6 +692,64 @@ fn collect_address_addends>( (result64, result32, offset) } +/// Lower the address of a pair load or store. +pub(crate) fn lower_pair_address>( + ctx: &mut C, + roots: &[InsnInput], + offset: i32, +) -> PairAMode { + // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero + // extends and addition ops. We update these as we consume address + // components, so they represent the remaining addends not yet handled. + let (addends64, addends32, args_offset) = collect_address_addends(ctx, roots); + let offset = args_offset + (offset as i64); + + trace!( + "lower_pair_address: addends64 {:?}, addends32 {:?}, offset {}", + addends64, + addends32, + offset + ); + + // Pairs basically only have reg + imm formats so we only have to worry about those + + let imm7_offset = SImm7Scaled::maybe_from_i64(offset, I64); + match (&addends64[..], &addends32[..], imm7_offset) { + (&[add64], &[], Some(offset)) => PairAMode::SignedOffset(add64, offset), + (&[], &[add32], Some(offset)) => { + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); + let (reg, extendop) = add32; + let signed = match extendop { + ExtendOp::SXTW => true, + ExtendOp::UXTW => false, + _ => unreachable!(), + }; + ctx.emit(Inst::Extend { + rd: tmp, + rn: reg, + signed, + from_bits: 32, + to_bits: 64, + }); + PairAMode::SignedOffset(tmp.to_reg(), offset) + } + (&[], &[], Some(offset)) => PairAMode::SignedOffset(zero_reg(), offset), + + (_, _, _) => { + // This is the general case, we just grab all addends and sum them into a register + let addr = ctx.alloc_tmp(I64).only_reg().unwrap(); + lower_add_addends(ctx, addr, addends64, addends32); + + let imm7 = imm7_offset.unwrap_or_else(|| { + lower_add_immediate(ctx, addr, addr.to_reg(), offset); + SImm7Scaled::maybe_from_i64(0, I64).unwrap() + }); + + PairAMode::SignedOffset(addr.to_reg(), imm7) + } + } +} + /// Lower the address of a load or store. pub(crate) fn lower_address>( ctx: &mut C, @@ -792,36 +850,23 @@ pub(crate) fn lower_address>( // If there is any offset, load that first into `addr`, and add the `reg` // that we kicked out of the `AMode`; otherwise, start with that reg. if offset != 0 { - // If we can fit offset or -offset in an imm12, use an add-imm - // to combine the reg and offset. Otherwise, load value first then add. - if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) { - ctx.emit(Inst::AluRRImm12 { - alu_op: ALUOp::Add64, - rd: addr, - rn: reg, - imm12, - }); - } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) { - ctx.emit(Inst::AluRRImm12 { - alu_op: ALUOp::Sub64, - rd: addr, - rn: reg, - imm12, - }); - } else { - lower_constant_u64(ctx, addr, offset as u64); - ctx.emit(Inst::AluRRR { - alu_op: ALUOp::Add64, - rd: addr, - rn: addr.to_reg(), - rm: reg, - }); - } + lower_add_immediate(ctx, addr, reg, offset) } else { ctx.emit(Inst::gen_move(addr, reg, I64)); } // Now handle reg64 and reg32-extended components. + lower_add_addends(ctx, addr, addends64, addends32); + + memarg +} + +fn lower_add_addends>( + ctx: &mut C, + rd: Writable, + addends64: AddressAddend64List, + addends32: AddressAddend32List, +) { for reg in addends64 { // If the register is the stack reg, we must move it to another reg // before adding it. @@ -834,8 +879,8 @@ pub(crate) fn lower_address>( }; ctx.emit(Inst::AluRRR { alu_op: ALUOp::Add64, - rd: addr, - rn: addr.to_reg(), + rd, + rn: rd.to_reg(), rm: reg, }); } @@ -843,14 +888,42 @@ pub(crate) fn lower_address>( assert!(reg != stack_reg()); ctx.emit(Inst::AluRRRExtend { alu_op: ALUOp::Add64, - rd: addr, - rn: addr.to_reg(), + rd, + rn: rd.to_reg(), rm: reg, extendop, }); } +} - memarg +/// Adds into `rd` a signed imm pattern matching the best instruction for it. +// TODO: This function is duplicated in ctx.gen_add_imm +fn lower_add_immediate>(ctx: &mut C, dst: Writable, src: Reg, imm: i64) { + // If we can fit offset or -offset in an imm12, use an add-imm + // Otherwise, lower the constant first then add. + if let Some(imm12) = Imm12::maybe_from_u64(imm as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: dst, + rn: src, + imm12, + }); + } else if let Some(imm12) = Imm12::maybe_from_u64(imm.wrapping_neg() as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: dst, + rn: src, + imm12, + }); + } else { + lower_constant_u64(ctx, dst, imm as u64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: dst, + rn: dst.to_reg(), + rm: src, + }); + } } pub(crate) fn lower_constant_u64>( @@ -1248,7 +1321,10 @@ fn load_op_to_ty(op: Opcode) -> Option { /// Helper to lower a load instruction; this is used in several places, because /// a load can sometimes be merged into another operation. -pub(crate) fn lower_load, F: FnMut(&mut C, Writable, Type, AMode)>( +pub(crate) fn lower_load< + C: LowerCtx, + F: FnMut(&mut C, ValueRegs>, Type, AMode), +>( ctx: &mut C, ir_inst: IRInst, inputs: &[InsnInput], @@ -1261,7 +1337,7 @@ pub(crate) fn lower_load, F: FnMut(&mut C, Writable, let off = ctx.data(ir_inst).load_store_offset().unwrap(); let mem = lower_address(ctx, elem_ty, &inputs[..], off); - let rd = get_output_reg(ctx, output).only_reg().unwrap(); + let rd = get_output_reg(ctx, output); f(ctx, rd, elem_ty, mem); } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 690eacc29889..36602cfe3d15 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1180,56 +1180,71 @@ pub(crate) fn lower_insn_to_regs>( .memflags(insn) .expect("Load instruction should have memflags"); - lower_load( - ctx, - insn, - &inputs[..], - outputs[0], - |ctx, rd, elem_ty, mem| { - let is_float = ty_has_float_or_vec_representation(elem_ty); - ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { - (1, _, _) => Inst::ULoad8 { rd, mem, flags }, - (8, false, _) => Inst::ULoad8 { rd, mem, flags }, - (8, true, _) => Inst::SLoad8 { rd, mem, flags }, - (16, false, _) => Inst::ULoad16 { rd, mem, flags }, - (16, true, _) => Inst::SLoad16 { rd, mem, flags }, - (32, false, false) => Inst::ULoad32 { rd, mem, flags }, - (32, true, false) => Inst::SLoad32 { rd, mem, flags }, - (32, _, true) => Inst::FpuLoad32 { rd, mem, flags }, - (64, _, false) => Inst::ULoad64 { rd, mem, flags }, - // Note that we treat some of the vector loads as scalar floating-point loads, - // which is correct in a little endian environment. - (64, _, true) => Inst::FpuLoad64 { rd, mem, flags }, - (128, _, _) => Inst::FpuLoad128 { rd, mem, flags }, - _ => panic!("Unsupported size in load"), - }); - - let vec_extend = match op { - Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8), - Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8), - Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8), - Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8), - Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16), - Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16), - Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16), - Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16), - Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32), - Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32), - Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32), - Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32), - _ => None, - }; - - if let Some(t) = vec_extend { - ctx.emit(Inst::VecExtend { - t, - rd, - rn: rd.to_reg(), - high_half: false, + let out_ty = ctx.output_ty(insn, 0); + if out_ty == I128 { + let off = ctx.data(insn).load_store_offset().unwrap(); + let mem = lower_pair_address(ctx, &inputs[..], off); + let dst = get_output_reg(ctx, outputs[0]); + ctx.emit(Inst::LoadP64 { + rt: dst.regs()[0], + rt2: dst.regs()[1], + mem, + flags, + }); + } else { + lower_load( + ctx, + insn, + &inputs[..], + outputs[0], + |ctx, dst, elem_ty, mem| { + let rd = dst.only_reg().unwrap(); + let is_float = ty_has_float_or_vec_representation(elem_ty); + ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { + (1, _, _) => Inst::ULoad8 { rd, mem, flags }, + (8, false, _) => Inst::ULoad8 { rd, mem, flags }, + (8, true, _) => Inst::SLoad8 { rd, mem, flags }, + (16, false, _) => Inst::ULoad16 { rd, mem, flags }, + (16, true, _) => Inst::SLoad16 { rd, mem, flags }, + (32, false, false) => Inst::ULoad32 { rd, mem, flags }, + (32, true, false) => Inst::SLoad32 { rd, mem, flags }, + (32, _, true) => Inst::FpuLoad32 { rd, mem, flags }, + (64, _, false) => Inst::ULoad64 { rd, mem, flags }, + // Note that we treat some of the vector loads as scalar floating-point loads, + // which is correct in a little endian environment. + (64, _, true) => Inst::FpuLoad64 { rd, mem, flags }, + (128, _, true) => Inst::FpuLoad128 { rd, mem, flags }, + _ => panic!("Unsupported size in load"), }); - } - }, - ); + + let vec_extend = match op { + Opcode::Sload8x8 => Some(VecExtendOp::Sxtl8), + Opcode::Sload8x8Complex => Some(VecExtendOp::Sxtl8), + Opcode::Uload8x8 => Some(VecExtendOp::Uxtl8), + Opcode::Uload8x8Complex => Some(VecExtendOp::Uxtl8), + Opcode::Sload16x4 => Some(VecExtendOp::Sxtl16), + Opcode::Sload16x4Complex => Some(VecExtendOp::Sxtl16), + Opcode::Uload16x4 => Some(VecExtendOp::Uxtl16), + Opcode::Uload16x4Complex => Some(VecExtendOp::Uxtl16), + Opcode::Sload32x2 => Some(VecExtendOp::Sxtl32), + Opcode::Sload32x2Complex => Some(VecExtendOp::Sxtl32), + Opcode::Uload32x2 => Some(VecExtendOp::Uxtl32), + Opcode::Uload32x2Complex => Some(VecExtendOp::Uxtl32), + _ => None, + }; + + if let Some(t) = vec_extend { + let rd = dst.only_reg().unwrap(); + ctx.emit(Inst::VecExtend { + t, + rd, + rn: rd.to_reg(), + high_half: false, + }); + } + }, + ); + } } Opcode::Store @@ -1253,19 +1268,30 @@ pub(crate) fn lower_insn_to_regs>( .memflags(insn) .expect("Store instruction should have memflags"); - let mem = lower_address(ctx, elem_ty, &inputs[1..], off); - let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - - ctx.emit(match (ty_bits(elem_ty), is_float) { - (1, _) | (8, _) => Inst::Store8 { rd, mem, flags }, - (16, _) => Inst::Store16 { rd, mem, flags }, - (32, false) => Inst::Store32 { rd, mem, flags }, - (32, true) => Inst::FpuStore32 { rd, mem, flags }, - (64, false) => Inst::Store64 { rd, mem, flags }, - (64, true) => Inst::FpuStore64 { rd, mem, flags }, - (128, _) => Inst::FpuStore128 { rd, mem, flags }, - _ => panic!("Unsupported size in store"), - }); + let dst = put_input_in_regs(ctx, inputs[0]); + + if elem_ty == I128 { + let mem = lower_pair_address(ctx, &inputs[1..], off); + ctx.emit(Inst::StoreP64 { + rt: dst.regs()[0], + rt2: dst.regs()[1], + mem, + flags, + }); + } else { + let rd = dst.only_reg().unwrap(); + let mem = lower_address(ctx, elem_ty, &inputs[1..], off); + ctx.emit(match (ty_bits(elem_ty), is_float) { + (1, _) | (8, _) => Inst::Store8 { rd, mem, flags }, + (16, _) => Inst::Store16 { rd, mem, flags }, + (32, false) => Inst::Store32 { rd, mem, flags }, + (32, true) => Inst::FpuStore32 { rd, mem, flags }, + (64, false) => Inst::Store64 { rd, mem, flags }, + (64, true) => Inst::FpuStore64 { rd, mem, flags }, + (128, _) => Inst::FpuStore128 { rd, mem, flags }, + _ => panic!("Unsupported size in store"), + }); + } } Opcode::StackAddr => { diff --git a/cranelift/filetests/filetests/isa/aarch64/stack.clif b/cranelift/filetests/filetests/isa/aarch64/stack.clif index 078922557ff9..17deccf7fd14 100644 --- a/cranelift/filetests/filetests/isa/aarch64/stack.clif +++ b/cranelift/filetests/filetests/isa/aarch64/stack.clif @@ -276,3 +276,167 @@ block0(v0: b1): return v0, v137 } + + + +function %i128_stack_store(i128) { +ss0 = explicit_slot 16 + +block0(v0: i128): + stack_store.i128 v0, ss0 + return +} +; TODO: Codegen improvement opportunities: This should be just a stp x0, x1, [sp, #-16] +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub sp, sp, #16 +; nextln: mov x2, sp +; nextln: stp x0, x1, [x2] +; nextln: add sp, sp, #16 +; nextln: ldp fp, lr, [sp], #16 + + +function %i128_stack_store_slot_offset(i128) { +ss0 = explicit_slot 16, offset 16 + +block0(v0: i128): + stack_store.i128 v0, ss0 + return +} +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub sp, sp, #16 +; nextln: mov x2, sp +; nextln: stp x0, x1, [x2] +; nextln: add sp, sp, #16 +; nextln: ldp fp, lr, [sp], #16 + + +function %i128_stack_store_inst_offset(i128) { +ss0 = explicit_slot 16 +ss1 = explicit_slot 16 + +block0(v0: i128): + stack_store.i128 v0, ss1+16 + return +} +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub sp, sp, #32 +; nextln: add x2, sp, #32 +; nextln: stp x0, x1, [x2] +; nextln: add sp, sp, #32 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %i128_stack_store_big(i128) { +ss0 = explicit_slot 100000 +ss1 = explicit_slot 8 + +block0(v0: i128): + stack_store.i128 v0, ss0 + return +} +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz w16, #34480 +; nextln: movk w16, #1, LSL #16 +; nextln: sub sp, sp, x16, UXTX +; nextln: mov x2, sp +; nextln: stp x0, x1, [x2] +; nextln: movz w16, #34480 +; nextln: movk w16, #1, LSL #16 +; nextln: add sp, sp, x16, UXTX +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + + +function %i128_stack_load() -> i128 { +ss0 = explicit_slot 16 + +block0: + v0 = stack_load.i128 ss0 + return v0 +} +; TODO: Codegen improvement opportunities: This should be just a ldp x0, x1, [sp, #-16] +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub sp, sp, #16 +; nextln: mov x0, sp +; nextln: ldp x1, x0, [x0] +; nextln: mov x2, x0 +; nextln: mov x0, x1 +; nextln: mov x1, x2 +; nextln: add sp, sp, #16 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %i128_stack_load_slot_offset() -> i128 { +ss0 = explicit_slot 16, offset 16 + +block0: + v0 = stack_load.i128 ss0 + return v0 +} +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub sp, sp, #16 +; nextln: mov x0, sp +; nextln: ldp x1, x0, [x0] +; nextln: mov x2, x0 +; nextln: mov x0, x1 +; nextln: mov x1, x2 +; nextln: add sp, sp, #16 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + + +function %i128_stack_load_inst_offset() -> i128 { +ss0 = explicit_slot 16 +ss1 = explicit_slot 16 + +block0: + v0 = stack_load.i128 ss1+16 + return v0 +} +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub sp, sp, #32 +; nextln: add x0, sp, #32 +; nextln: ldp x1, x0, [x0] +; nextln: mov x2, x0 +; nextln: mov x0, x1 +; nextln: mov x1, x2 +; nextln: add sp, sp, #32 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + + +function %i128_stack_load_big() -> i128 { +ss0 = explicit_slot 100000 +ss1 = explicit_slot 8 + +block0: + v0 = stack_load.i128 ss0 + return v0 +} +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz w16, #34480 +; nextln: movk w16, #1, LSL #16 +; nextln: sub sp, sp, x16, UXTX +; nextln: mov x0, sp +; nextln: ldp x1, x0, [x0] +; nextln: mov x2, x0 +; nextln: mov x0, x1 +; nextln: mov x1, x2 +; nextln: movz w16, #34480 +; nextln: movk w16, #1, LSL #16 +; nextln: add sp, sp, x16, UXTX +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/runtests/i128-load-store.clif b/cranelift/filetests/filetests/runtests/i128-load-store.clif new file mode 100644 index 000000000000..b02e8ec26b66 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/i128-load-store.clif @@ -0,0 +1,91 @@ +test run +target x86_64 machinst +target aarch64 + +function %i128_stack_store_load(i64, i64) -> b1 { + ss0 = explicit_slot 16 + +block0(v0: i64,v1: i64): + v2 = iconcat v0, v1 + + stack_store.i128 v2, ss0 + v3 = stack_load.i128 ss0 + + v4 = icmp.i128 eq v2, v3 + return v4 +} +; run: %i128_stack_store_load(0, 0) == true +; run: %i128_stack_store_load(-1, -1) == true +; run: %i128_stack_store_load(-1, 0) == true +; run: %i128_stack_store_load(0, -1) == true +; run: %i128_stack_store_load(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true +; run: %i128_stack_store_load(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true +; run: %i128_stack_store_load(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true + + +function %i128_stack_store_load_offset(i64, i64) -> b1 { + ss0 = explicit_slot 16, offset 16 + +block0(v0: i64,v1: i64): + v2 = iconcat v0, v1 + + stack_store.i128 v2, ss0 + v3 = stack_load.i128 ss0 + + v4 = icmp.i128 eq v2, v3 + return v4 +} +; run: %i128_stack_store_load_offset(0, 0) == true +; run: %i128_stack_store_load_offset(-1, -1) == true +; run: %i128_stack_store_load_offset(-1, 0) == true +; run: %i128_stack_store_load_offset(0, -1) == true +; run: %i128_stack_store_load_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true +; run: %i128_stack_store_load_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true +; run: %i128_stack_store_load_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true + + +function %i128_stack_store_load_inst_offset(i64, i64) -> b1 { + ss0 = explicit_slot 16 + ss1 = explicit_slot 16 + ss2 = explicit_slot 16 + +block0(v0: i64,v1: i64): + v2 = iconcat v0, v1 + + stack_store.i128 v2, ss1+16 + v3 = stack_load.i128 ss1+16 + + v4 = icmp.i128 eq v2, v3 + return v4 +} +; run: %i128_stack_store_load_inst_offset(0, 0) == true +; run: %i128_stack_store_load_inst_offset(-1, -1) == true +; run: %i128_stack_store_load_inst_offset(-1, 0) == true +; run: %i128_stack_store_load_inst_offset(0, -1) == true +; run: %i128_stack_store_load_inst_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true +; run: %i128_stack_store_load_inst_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true +; run: %i128_stack_store_load_inst_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true + + +; Some arches (aarch64) try to encode the offset into the load/store instructions +; test that we spill if the offset is too large and doesn't fit in the instruction +function %i128_stack_store_load_big_offset(i64, i64) -> b1 { + ss0 = explicit_slot 100000 + ss1 = explicit_slot 8 + +block0(v0: i64,v1: i64): + v2 = iconcat v0, v1 + + stack_store.i128 v2, ss0 + v3 = stack_load.i128 ss0 + + v4 = icmp.i128 eq v2, v3 + return v4 +} +; run: %i128_stack_store_load_big_offset(0, 0) == true +; run: %i128_stack_store_load_big_offset(-1, -1) == true +; run: %i128_stack_store_load_big_offset(-1, 0) == true +; run: %i128_stack_store_load_big_offset(0, -1) == true +; run: %i128_stack_store_load_big_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true +; run: %i128_stack_store_load_big_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true +; run: %i128_stack_store_load_big_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true From c82764605fc6aafaf941f7f2de163e8b3065c9ea Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Thu, 17 Jun 2021 15:50:08 +0100 Subject: [PATCH 2/2] aarch64: Add i128 load & store tests and refactor address calculation The previous address calculation code had a bug where we tried to add offsets into a temporary register before defining it, causing the regalloc to complain. --- cranelift/codegen/src/isa/aarch64/lower.rs | 65 +++++---- .../filetests/isa/aarch64/amodes.clif | 127 ++++++++++++++++++ .../filetests/runtests/i128-load-store.clif | 45 +++++++ 3 files changed, 204 insertions(+), 33 deletions(-) diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index e7853ca98e6c..039f3b3cd3f9 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -701,7 +701,7 @@ pub(crate) fn lower_pair_address>( // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero // extends and addition ops. We update these as we consume address // components, so they represent the remaining addends not yet handled. - let (addends64, addends32, args_offset) = collect_address_addends(ctx, roots); + let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots); let offset = args_offset + (offset as i64); trace!( @@ -713,41 +713,40 @@ pub(crate) fn lower_pair_address>( // Pairs basically only have reg + imm formats so we only have to worry about those - let imm7_offset = SImm7Scaled::maybe_from_i64(offset, I64); - match (&addends64[..], &addends32[..], imm7_offset) { - (&[add64], &[], Some(offset)) => PairAMode::SignedOffset(add64, offset), - (&[], &[add32], Some(offset)) => { - let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); - let (reg, extendop) = add32; - let signed = match extendop { - ExtendOp::SXTW => true, - ExtendOp::UXTW => false, - _ => unreachable!(), - }; - ctx.emit(Inst::Extend { - rd: tmp, - rn: reg, - signed, - from_bits: 32, - to_bits: 64, - }); - PairAMode::SignedOffset(tmp.to_reg(), offset) - } - (&[], &[], Some(offset)) => PairAMode::SignedOffset(zero_reg(), offset), + let base_reg = if let Some(reg64) = addends64.pop() { + reg64 + } else if let Some((reg32, extendop)) = addends32.pop() { + let tmp = ctx.alloc_tmp(I64).only_reg().unwrap(); + let signed = match extendop { + ExtendOp::SXTW => true, + ExtendOp::UXTW => false, + _ => unreachable!(), + }; + ctx.emit(Inst::Extend { + rd: tmp, + rn: reg32, + signed, + from_bits: 32, + to_bits: 64, + }); + tmp.to_reg() + } else { + zero_reg() + }; + + let addr = ctx.alloc_tmp(I64).only_reg().unwrap(); + ctx.emit(Inst::gen_move(addr, base_reg, I64)); - (_, _, _) => { - // This is the general case, we just grab all addends and sum them into a register - let addr = ctx.alloc_tmp(I64).only_reg().unwrap(); - lower_add_addends(ctx, addr, addends64, addends32); + // We have the base register, if we have any others, we need to add them + lower_add_addends(ctx, addr, addends64, addends32); - let imm7 = imm7_offset.unwrap_or_else(|| { - lower_add_immediate(ctx, addr, addr.to_reg(), offset); - SImm7Scaled::maybe_from_i64(0, I64).unwrap() - }); + // Figure out what offset we should emit + let imm7 = SImm7Scaled::maybe_from_i64(offset, I64).unwrap_or_else(|| { + lower_add_immediate(ctx, addr, addr.to_reg(), offset); + SImm7Scaled::maybe_from_i64(0, I64).unwrap() + }); - PairAMode::SignedOffset(addr.to_reg(), imm7) - } - } + PairAMode::SignedOffset(addr.to_reg(), imm7) } /// Lower the address of a load or store. diff --git a/cranelift/filetests/filetests/isa/aarch64/amodes.clif b/cranelift/filetests/filetests/isa/aarch64/amodes.clif index 837ba1815fe1..fbab91d7f72e 100644 --- a/cranelift/filetests/filetests/isa/aarch64/amodes.clif +++ b/cranelift/filetests/filetests/isa/aarch64/amodes.clif @@ -386,3 +386,130 @@ block0(v0: i64, v1: i64, v2: i64): ; nextln: ldrsh x0, [x0] ; nextln: ldp fp, lr, [sp], #16 ; nextln: ret + + +function %i128(i64) -> i128 { +block0(v0: i64): + v1 = load.i128 v0 + store.i128 v1, v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: ldp x2, x1, [x1] +; nextln: stp x2, x1, [x0] +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %i128_imm_offset(i64) -> i128 { +block0(v0: i64): + v1 = load.i128 v0+16 + store.i128 v1, v0+16 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: ldp x2, x1, [x1, #16] +; nextln: stp x2, x1, [x0, #16] +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %i128_imm_offset_large(i64) -> i128 { +block0(v0: i64): + v1 = load.i128 v0+504 + store.i128 v1, v0+504 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: ldp x2, x1, [x1, #504] +; nextln: stp x2, x1, [x0, #504] +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %i128_imm_offset_negative_large(i64) -> i128 { +block0(v0: i64): + v1 = load.i128 v0-512 + store.i128 v1, v0-512 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: ldp x2, x1, [x1, #-512] +; nextln: stp x2, x1, [x0, #-512] +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %i128_add_offset(i64) -> i128 { +block0(v0: i64): + v1 = iadd_imm v0, 32 + v2 = load.i128 v1 + store.i128 v2, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x1, x0 +; nextln: ldp x2, x1, [x1, #32] +; nextln: stp x2, x1, [x0, #32] +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %i128_32bit_sextend_simple(i32) -> i128 { +block0(v0: i32): + v1 = sextend.i64 v0 + v2 = load.i128 v1 + store.i128 v2, v1 + return v2 +} + +; TODO: We should be able to deduplicate the sxtw instruction +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtw x1, w0 +; nextln: ldp x2, x1, [x1] +; nextln: sxtw x0, w0 +; nextln: stp x2, x1, [x0] +; nextln: mov x0, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %i128_32bit_sextend(i64, i32) -> i128 { +block0(v0: i64, v1: i32): + v2 = sextend.i64 v1 + v3 = iadd.i64 v0, v2 + v4 = iadd_imm.i64 v3, 24 + v5 = load.i128 v4 + store.i128 v5, v4 + return v5 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov x2, x0 +; nextln: add x2, x2, x1, SXTW +; nextln: ldp x3, x2, [x2, #24] +; nextln: add x0, x0, x1, SXTW +; nextln: stp x3, x2, [x0, #24] +; nextln: mov x0, x3 +; nextln: mov x1, x2 +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/runtests/i128-load-store.clif b/cranelift/filetests/filetests/runtests/i128-load-store.clif index b02e8ec26b66..41046e871744 100644 --- a/cranelift/filetests/filetests/runtests/i128-load-store.clif +++ b/cranelift/filetests/filetests/runtests/i128-load-store.clif @@ -89,3 +89,48 @@ block0(v0: i64,v1: i64): ; run: %i128_stack_store_load_big_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true ; run: %i128_stack_store_load_big_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true ; run: %i128_stack_store_load_big_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true + + + +function %i128_store_load(i64, i64) -> b1 { + ss0 = explicit_slot 16 + +block0(v0: i64,v1: i64): + v2 = iconcat v0, v1 + + v3 = stack_addr.i64 ss0 + store.i128 v2, v3 + v4 = load.i128 v3 + + v5 = icmp.i128 eq v2, v4 + return v5 +} +; run: %i128_store_load(0, 0) == true +; run: %i128_store_load(-1, -1) == true +; run: %i128_store_load(-1, 0) == true +; run: %i128_store_load(0, -1) == true +; run: %i128_store_load(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true +; run: %i128_store_load(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true +; run: %i128_store_load(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true + + +function %i128_store_load_offset(i64, i64) -> b1 { + ss0 = explicit_slot 32 + +block0(v0: i64,v1: i64): + v2 = iconcat v0, v1 + + v3 = stack_addr.i64 ss0 + store.i128 v2, v3+16 + v4 = load.i128 v3+16 + + v5 = icmp.i128 eq v2, v4 + return v5 +} +; run: %i128_store_load_offset(0, 0) == true +; run: %i128_store_load_offset(-1, -1) == true +; run: %i128_store_load_offset(-1, 0) == true +; run: %i128_store_load_offset(0, -1) == true +; run: %i128_store_load_offset(0x01234567_89ABCDEF, 0xFEDCBA98_76543210) == true +; run: %i128_store_load_offset(0x06060606_06060606, 0xA00A00A0_0A00A00A) == true +; run: %i128_store_load_offset(0xC0FFEEEE_DECAFFFF, 0xDECAFFFF_C0FFEEEE) == true