Merge pull request #3032 from afonso360/aarch64-rmw
aarch64: Implement missing atomic rmw ops
cfallin authored Jun 25, 2021
2 parents 1047c4e + e85eb77 commit bc6f751
Showing 3 changed files with 520 additions and 19 deletions.
105 changes: 86 additions & 19 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1273,6 +1273,8 @@ impl MachInstEmit for Inst {
mov x28, x26
so that we simply write in the destination, the "2nd arg for op".
*/
// TODO: We should not hardcode registers here; a better approach would be to
// pass some scratch registers in the AtomicRMW pseudo-instruction and use those.
let xzr = zero_reg();
let x24 = xreg(24);
let x25 = xreg(25);
@@ -1294,25 +1296,90 @@
}
sink.put4(enc_ldxr(ty, x27wr, x25)); // ldxr x27, [x25]

if op == inst_common::AtomicRmwOp::Xchg {
// mov x28, x26
sink.put4(enc_arith_rrr(0b101_01010_00_0, 0b000000, x28wr, xzr, x26))
} else {
// add/sub/and/orr/eor x28, x27, x26
let bits_31_21 = match op {
inst_common::AtomicRmwOp::Add => 0b100_01011_00_0,
inst_common::AtomicRmwOp::Sub => 0b110_01011_00_0,
inst_common::AtomicRmwOp::And => 0b100_01010_00_0,
inst_common::AtomicRmwOp::Or => 0b101_01010_00_0,
inst_common::AtomicRmwOp::Xor => 0b110_01010_00_0,
inst_common::AtomicRmwOp::Nand
| inst_common::AtomicRmwOp::Umin
| inst_common::AtomicRmwOp::Umax
| inst_common::AtomicRmwOp::Smin
| inst_common::AtomicRmwOp::Smax => todo!("{:?}", op),
inst_common::AtomicRmwOp::Xchg => unreachable!(),
};
sink.put4(enc_arith_rrr(bits_31_21, 0b000000, x28wr, x27, x26));
match op {
AtomicRmwOp::Xchg => {
// mov x28, x26
Inst::Mov64 { rd: x28wr, rm: x26 }.emit(sink, emit_info, state);
}
AtomicRmwOp::Nand => {
// and x28, x27, x26
// mvn x28, x28
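// (mvn Xd, Xm is an alias of orn Xd, xzr, Xm, which is why the inverted
//  move is emitted below as OrrNot64 with the zero register as rn)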

Inst::AluRRR {
alu_op: ALUOp::And64,
rd: x28wr,
rn: x27,
rm: x26,
}
.emit(sink, emit_info, state);

Inst::AluRRR {
alu_op: ALUOp::OrrNot64,
rd: x28wr,
rn: xzr,
rm: x28,
}
.emit(sink, emit_info, state);
}
AtomicRmwOp::Umin
| AtomicRmwOp::Umax
| AtomicRmwOp::Smin
| AtomicRmwOp::Smax => {
// cmp x27, x26
// csel.op x28, x27, x26
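// (cmp is subs with the zero register as destination; the csel then keeps
//  the current memory value x27 when the condition holds and takes the
//  operand x26 otherwise)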

let cond = match op {
AtomicRmwOp::Umin => Cond::Lo,
AtomicRmwOp::Umax => Cond::Hi,
AtomicRmwOp::Smin => Cond::Lt,
AtomicRmwOp::Smax => Cond::Gt,
_ => unreachable!(),
};

Inst::AluRRR {
alu_op: if ty == I64 {
ALUOp::SubS64
} else {
ALUOp::SubS32
},
rd: writable_zero_reg(),
rn: x27,
rm: x26,
}
.emit(sink, emit_info, state);

Inst::CSel {
cond,
rd: x28wr,
rn: x27,
rm: x26,
}
.emit(sink, emit_info, state);
}
_ => {
// add/sub/and/orr/eor x28, x27, x26
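// (the 64-bit ALU forms suffice even for narrower types: the low bits of a
//  64-bit add/sub/and/orr/eor equal the narrow result, and the store-exclusive
//  later in this sequence writes back only `ty` bits)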
let alu_op = match op {
AtomicRmwOp::Add => ALUOp::Add64,
AtomicRmwOp::Sub => ALUOp::Sub64,
AtomicRmwOp::And => ALUOp::And64,
AtomicRmwOp::Or => ALUOp::Orr64,
AtomicRmwOp::Xor => ALUOp::Eor64,
AtomicRmwOp::Nand
| AtomicRmwOp::Umin
| AtomicRmwOp::Umax
| AtomicRmwOp::Smin
| AtomicRmwOp::Smax
| AtomicRmwOp::Xchg => unreachable!(),
};

Inst::AluRRR {
alu_op,
rd: x28wr,
rn: x27,
rm: x26,
}
.emit(sink, emit_info, state);
}
}

let srcloc = state.cur_srcloc();
237 changes: 237 additions & 0 deletions cranelift/filetests/filetests/runtests/atomic-rmw-2.clif
@@ -0,0 +1,237 @@
test run
target aarch64
target x86_64 machinst
; TODO: Merge this with atomic-rmw.clif when s390x supports it


function %atomic_rmw_nand_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8

block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0

v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 nand v2, v1

v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_nand_i64(0, 0) == -1
; run: %atomic_rmw_nand_i64(1, 0) == -1
; run: %atomic_rmw_nand_i64(0, 1) == -1
; run: %atomic_rmw_nand_i64(1, 1) == -2
; run: %atomic_rmw_nand_i64(0xC0FFEEEE_DECAFFFF, 0x7DCB5691_7DCB5691) == 0xBF34B97F_A335A96E

function %atomic_rmw_nand_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4

block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0

v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 nand v2, v1

v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_nand_i32(0, 0) == -1
; run: %atomic_rmw_nand_i32(1, 0) == -1
; run: %atomic_rmw_nand_i32(0, 1) == -1
; run: %atomic_rmw_nand_i32(1, 1) == -2
; run: %atomic_rmw_nand_i32(0xC0FFEEEE, 0x7DCB5691) == 0xBF34B97F



function %atomic_rmw_umin_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8

block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0

v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 umin v2, v1

v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_umin_i64(0, 0) == 0
; run: %atomic_rmw_umin_i64(1, 0) == 0
; run: %atomic_rmw_umin_i64(0, 1) == 0
; run: %atomic_rmw_umin_i64(1, 1) == 1
; run: %atomic_rmw_umin_i64(-1, 1) == 1
; run: %atomic_rmw_umin_i64(-1, -3) == -3

function %atomic_rmw_umin_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4

block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0

v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 umin v2, v1

v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_umin_i32(0, 0) == 0
; run: %atomic_rmw_umin_i32(1, 0) == 0
; run: %atomic_rmw_umin_i32(0, 1) == 0
; run: %atomic_rmw_umin_i32(1, 1) == 1
; run: %atomic_rmw_umin_i32(-1, 1) == 1
; run: %atomic_rmw_umin_i32(-1, -3) == -3



function %atomic_rmw_umax_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8

block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0

v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 umax v2, v1

v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_umax_i64(0, 0) == 0
; run: %atomic_rmw_umax_i64(1, 0) == 1
; run: %atomic_rmw_umax_i64(0, 1) == 1
; run: %atomic_rmw_umax_i64(1, 1) == 1
; run: %atomic_rmw_umax_i64(-1, 1) == -1
; run: %atomic_rmw_umax_i64(-1, -3) == -1

function %atomic_rmw_umax_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4

block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0

v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 umax v2, v1

v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_umax_i32(0, 0) == 0
; run: %atomic_rmw_umax_i32(1, 0) == 1
; run: %atomic_rmw_umax_i32(0, 1) == 1
; run: %atomic_rmw_umax_i32(1, 1) == 1
; run: %atomic_rmw_umax_i32(-1, 1) == -1
; run: %atomic_rmw_umax_i32(-1, -3) == -1



function %atomic_rmw_smin_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8

block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0

v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 smin v2, v1

v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_smin_i64(0, 0) == 0
; run: %atomic_rmw_smin_i64(1, 0) == 0
; run: %atomic_rmw_smin_i64(0, 1) == 0
; run: %atomic_rmw_smin_i64(1, 1) == 1
; run: %atomic_rmw_smin_i64(-1, 1) == -1
; run: %atomic_rmw_smin_i64(-1, -3) == -3

function %atomic_rmw_smin_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4

block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0

v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 smin v2, v1

v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_smin_i32(0, 0) == 0
; run: %atomic_rmw_smin_i32(1, 0) == 0
; run: %atomic_rmw_smin_i32(0, 1) == 0
; run: %atomic_rmw_smin_i32(1, 1) == 1
; run: %atomic_rmw_smin_i32(-1, -1) == -1
; run: %atomic_rmw_smin_i32(-1, -3) == -3



function %atomic_rmw_smax_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8

block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0

v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 smax v2, v1

v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_smax_i64(0, 0) == 0
; run: %atomic_rmw_smax_i64(1, 0) == 1
; run: %atomic_rmw_smax_i64(0, 1) == 1
; run: %atomic_rmw_smax_i64(1, 1) == 1
; run: %atomic_rmw_smax_i64(-1, 1) == 1
; run: %atomic_rmw_smax_i64(-1, -3) == -1

function %atomic_rmw_smax_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4

block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0

v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 smax v2, v1

v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_smax_i32(0, 0) == 0
; run: %atomic_rmw_smax_i32(1, 0) == 1
; run: %atomic_rmw_smax_i32(0, 1) == 1
; run: %atomic_rmw_smax_i32(1, 1) == 1
; run: %atomic_rmw_smax_i32(-1, 1) == 1
; run: %atomic_rmw_smax_i32(-1, -3) == -1



function %atomic_rmw_xchg_i64(i64, i64) -> i64 {
ss0 = explicit_slot 8

block0(v0: i64, v1: i64):
stack_store.i64 v0, ss0

v2 = stack_addr.i64 ss0
v3 = atomic_rmw.i64 xchg v2, v1

v4 = stack_load.i64 ss0
return v4
}
; run: %atomic_rmw_xchg_i64(0, 0) == 0
; run: %atomic_rmw_xchg_i64(1, 0) == 0
; run: %atomic_rmw_xchg_i64(0, 1) == 1
; run: %atomic_rmw_xchg_i64(0, 0xC0FFEEEE_DECAFFFF) == 0xC0FFEEEE_DECAFFFF

function %atomic_rmw_xchg_i32(i32, i32) -> i32 {
ss0 = explicit_slot 4

block0(v0: i32, v1: i32):
stack_store.i32 v0, ss0

v2 = stack_addr.i32 ss0
v3 = atomic_rmw.i32 xchg v2, v1

v4 = stack_load.i32 ss0
return v4
}
; run: %atomic_rmw_xchg_i32(0, 0) == 0
; run: %atomic_rmw_xchg_i32(1, 0) == 0
; run: %atomic_rmw_xchg_i32(0, 1) == 1
; run: %atomic_rmw_xchg_i32(0, 0xC0FFEEEE) == 0xC0FFEEEE
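
The hex and sign expectations in the tests above can be re-derived off-target. The short plain-Rust check below is illustrative only; it is not part of this commit and assumes Rust's integer semantics model the CLIF nand/umin/umax/smin/smax operations:

fn main() {
    // nand.i32: 0xC0FFEEEE nand 0x7DCB5691 == 0xBF34B97F
    assert_eq!(!(0xC0FF_EEEEu32 & 0x7DCB_5691u32), 0xBF34_B97Fu32);
    // nand.i64: 0xC0FFEEEE_DECAFFFF nand 0x7DCB5691_7DCB5691 == 0xBF34B97F_A335A96E
    assert_eq!(
        !(0xC0FF_EEEE_DECA_FFFFu64 & 0x7DCB_5691_7DCB_5691u64),
        0xBF34_B97F_A335_A96Eu64
    );
    // umin/umax treat -1 as the largest unsigned value, hence
    // umin(-1, 1) == 1 and umax(-1, 1) == -1 in the tests above.
    assert_eq!((-1i64 as u64).min(1u64), 1);
    assert_eq!((-1i64 as u64).max(1u64) as i64, -1);
    // smin/smax compare as signed, hence smin(-1, 1) == -1 and smax(-1, 1) == 1.
    assert_eq!((-1i64).min(1), -1);
    assert_eq!((-1i64).max(1), 1);
    println!("expected values confirmed");
}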