From 14ee88686a4c3064fd5d996b7818130c3681092a Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Sat, 18 Mar 2023 12:03:10 -0700 Subject: [PATCH 1/3] x64: Add AVX encodings of `vcvt{ss2sd,sd2ss}` Additionally update the instruction helpers to take an `XmmMem` argument to allow load sinking into the instruction. --- cranelift/codegen/src/isa/x64/inst.isle | 16 ++- cranelift/codegen/src/isa/x64/inst/args.rs | 4 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 19 ++- .../isa/x64/fpromote-demote-avx.clif | 130 ++++++++++++++++++ .../filetests/isa/x64/fpromote-demote.clif | 130 ++++++++++++++++++ cranelift/foo.clif | 5 + 6 files changed, 296 insertions(+), 8 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif create mode 100644 cranelift/filetests/filetests/isa/x64/fpromote-demote.clif create mode 100644 cranelift/foo.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 1fdc6b25c082..0d79fa629a0e 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1299,6 +1299,8 @@ Vpmovmskb Vcvtsi2ss Vcvtsi2sd + Vcvtss2sd + Vcvtsd2ss )) (type Avx512Opcode extern @@ -4005,12 +4007,18 @@ (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtpd) x)) ;; Helper for creating `cvtss2sd` instructions. -(decl x64_cvtss2sd (Xmm) Xmm) -(rule (x64_cvtss2sd x) (xmm_unary_rm_r (SseOpcode.Cvtss2sd) x)) +(decl x64_cvtss2sd (XmmMem) Xmm) +(rule (x64_cvtss2sd x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtss2sd) x)) +(rule 1 (x64_cvtss2sd x) + (if-let $true (use_avx_simd)) + (xmm_unary_rm_r_vex (AvxOpcode.Vcvtss2sd) x)) ;; Helper for creating `cvtsd2ss` instructions. -(decl x64_cvtsd2ss (Xmm) Xmm) -(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x)) +(decl x64_cvtsd2ss (XmmMem) Xmm) +(rule (x64_cvtsd2ss x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtsd2ss) x)) +(rule 1 (x64_cvtsd2ss x) + (if-let $true (use_avx_simd)) + (xmm_unary_rm_r_vex (AvxOpcode.Vcvtsd2ss) x)) ;; Helper for creating `cvtdq2ps` instructions. (decl x64_cvtdq2ps (XmmMem) Xmm) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index a135fc5af198..14c513e5edd1 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1722,7 +1722,9 @@ impl AvxOpcode { | AvxOpcode::Vmovmskpd | AvxOpcode::Vpmovmskb | AvxOpcode::Vcvtsi2ss - | AvxOpcode::Vcvtsi2sd => { + | AvxOpcode::Vcvtsi2sd + | AvxOpcode::Vcvtss2sd + | AvxOpcode::Vcvtsd2ss => { smallvec![InstructionSet::AVX] } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 2b0f3af084b0..d7a125f52f43 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2405,17 +2405,30 @@ pub(crate) fn emit( AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18), AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12), + AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A), + AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A), + _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; - VexInstruction::new() + let mut vex = VexInstruction::new() .length(VexVectorLength::V128) .prefix(prefix) .map(map) .opcode(opcode) .reg(dst.to_real_reg().unwrap().hw_enc()) - .rm(src) - .encode(sink); + .rm(src); + + // These opcodes take a second operand through `vvvv` which copies + // the upper bits into the destination register. That's not + // reflected in the CLIF instruction, however, since the SSE version + // doesn't have this functionality. Instead just copy whatever + // happens to already be in the destination, which at least is what + // LLVM seems to do. + if let AvxOpcode::Vcvtss2sd | AvxOpcode::Vcvtsd2ss = op { + vex = vex.vvvv(dst.to_real_reg().unwrap().hw_enc()); + } + vex.encode(sink); } Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => { diff --git a/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif b/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif new file mode 100644 index 000000000000..999d9d5ec5a7 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif @@ -0,0 +1,130 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %fpromote(f32) -> f64 { +block0(v0: f32): + v1 = fpromote.f64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vcvtss2sd %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vcvtss2sd %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fpromote_load(i64, f32) -> f64 { + ss0 = explicit_slot 16 + +block0(v1: i64, v2: f32): + v3 = stack_addr.i64 ss0 + store.f32 v2, v3 + v4 = load.f32 v3 + v5 = fpromote.f64 v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; block0: +; lea rsp(0 + virtual offset), %rdx +; vmovss %xmm0, 0(%rdx) +; vcvtss2sd 0(%rdx), %xmm0 +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; block1: ; offset 0x8 +; leaq (%rsp), %rdx +; vmovss %xmm0, (%rdx) ; trap: heap_oob +; vcvtss2sd (%rdx), %xmm0, %xmm0 ; trap: heap_oob +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fdemote(f64) -> f32 { +block0(v0: f64): + v1 = fdemote.f32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vcvtsd2ss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vcvtsd2ss %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fdemote_load(i64, f64) -> f32 { + ss0 = explicit_slot 16 + +block0(v1: i64, v2: f64): + v3 = stack_addr.i64 ss0 + store.f64 v2, v3 + v4 = load.f64 v3 + v5 = fdemote.f32 v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; block0: +; lea rsp(0 + virtual offset), %rdx +; vmovsd %xmm0, 0(%rdx) +; vcvtsd2ss 0(%rdx), %xmm0 +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; block1: ; offset 0x8 +; leaq (%rsp), %rdx +; vmovsd %xmm0, (%rdx) ; trap: heap_oob +; vcvtsd2ss (%rdx), %xmm0, %xmm0 ; trap: heap_oob +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif b/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif new file mode 100644 index 000000000000..3f6cf72e307c --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif @@ -0,0 +1,130 @@ +test compile precise-output +set enable_simd +target x86_64 + +function %fpromote(f32) -> f64 { +block0(v0: f32): + v1 = fpromote.f64 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvtss2sd %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; cvtss2sd %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fpromote_load(i64, f32) -> f64 { + ss0 = explicit_slot 16 + +block0(v1: i64, v2: f32): + v3 = stack_addr.i64 ss0 + store.f32 v2, v3 + v4 = load.f32 v3 + v5 = fpromote.f64 v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; block0: +; lea rsp(0 + virtual offset), %rdx +; movss %xmm0, 0(%rdx) +; cvtss2sd 0(%rdx), %xmm0 +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; block1: ; offset 0x8 +; leaq (%rsp), %rdx +; movss %xmm0, (%rdx) ; trap: heap_oob +; cvtss2sd (%rdx), %xmm0 ; trap: heap_oob +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fdemote(f64) -> f32 { +block0(v0: f64): + v1 = fdemote.f32 v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; cvtsd2ss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; cvtsd2ss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %fdemote_load(i64, f64) -> f32 { + ss0 = explicit_slot 16 + +block0(v1: i64, v2: f64): + v3 = stack_addr.i64 ss0 + store.f64 v2, v3 + v4 = load.f64 v3 + v5 = fdemote.f32 v4 + return v5 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; subq %rsp, $16, %rsp +; block0: +; lea rsp(0 + virtual offset), %rdx +; movsd %xmm0, 0(%rdx) +; cvtsd2ss 0(%rdx), %xmm0 +; addq %rsp, $16, %rsp +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; subq $0x10, %rsp +; block1: ; offset 0x8 +; leaq (%rsp), %rdx +; movsd %xmm0, (%rdx) ; trap: heap_oob +; cvtsd2ss (%rdx), %xmm0 ; trap: heap_oob +; addq $0x10, %rsp +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/foo.clif b/cranelift/foo.clif new file mode 100644 index 000000000000..e96ed26f90c9 --- /dev/null +++ b/cranelift/foo.clif @@ -0,0 +1,5 @@ +function %sqrt_f32(f32) -> f32 { +block0(v0: f32): + v1 = sqrt v0 + return v1 +} From 22ca7bd37d1e210f1417548af4fedf955b1f0588 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Sat, 18 Mar 2023 12:09:33 -0700 Subject: [PATCH 2/3] x64: Add AVX encoding of `sqrts{s,d}` --- cranelift/codegen/src/isa/x64/inst.isle | 8 +++ cranelift/codegen/src/isa/x64/inst/args.rs | 4 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 14 +++-- .../filetests/isa/x64/fsqrt-avx.clif | 54 +++++++++++++++++++ .../filetests/filetests/isa/x64/fsqrt.clif | 54 +++++++++++++++++++ 5 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif create mode 100644 cranelift/filetests/filetests/isa/x64/fsqrt.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index 0d79fa629a0e..d0cfa5821ca0 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1301,6 +1301,8 @@ Vcvtsi2sd Vcvtss2sd Vcvtsd2ss + Vsqrtss + Vsqrtsd )) (type Avx512Opcode extern @@ -3987,10 +3989,16 @@ ;; Helper for creating `sqrtss` instructions. (decl x64_sqrtss (XmmMem) Xmm) (rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x)) +(rule 1 (x64_sqrtss x) + (if-let $true (use_avx_simd)) + (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtss) x)) ;; Helper for creating `sqrtsd` instructions. (decl x64_sqrtsd (XmmMem) Xmm) (rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x)) +(rule 1 (x64_sqrtsd x) + (if-let $true (use_avx_simd)) + (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtsd) x)) ;; Helper for creating `sqrtps` instructions. (decl x64_sqrtps (XmmMem) Xmm) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 14c513e5edd1..18a2b2469224 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1724,7 +1724,9 @@ impl AvxOpcode { | AvxOpcode::Vcvtsi2ss | AvxOpcode::Vcvtsi2sd | AvxOpcode::Vcvtss2sd - | AvxOpcode::Vcvtsd2ss => { + | AvxOpcode::Vcvtsd2ss + | AvxOpcode::Vsqrtss + | AvxOpcode::Vsqrtsd => { smallvec![InstructionSet::AVX] } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index d7a125f52f43..a9366a5df998 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2407,11 +2407,13 @@ pub(crate) fn emit( AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A), AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A), + AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51), + AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; - let mut vex = VexInstruction::new() + let vex = VexInstruction::new() .length(VexVectorLength::V128) .prefix(prefix) .map(map) @@ -2425,9 +2427,13 @@ pub(crate) fn emit( // doesn't have this functionality. Instead just copy whatever // happens to already be in the destination, which at least is what // LLVM seems to do. - if let AvxOpcode::Vcvtss2sd | AvxOpcode::Vcvtsd2ss = op { - vex = vex.vvvv(dst.to_real_reg().unwrap().hw_enc()); - } + let vex = match op { + AvxOpcode::Vcvtss2sd + | AvxOpcode::Vcvtsd2ss + | AvxOpcode::Vsqrtss + | AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()), + _ => vex, + }; vex.encode(sink); } diff --git a/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif b/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif new file mode 100644 index 000000000000..35fb832a8c6f --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif @@ -0,0 +1,54 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %sqrt_f32(f32) -> f32 { +block0(v0: f32): + v1 = sqrt v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vsqrtss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vsqrtss %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sqrt_f64(f64) -> f64 { +block0(v0: f64): + v1 = sqrt v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vsqrtsd %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vsqrtsd %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/filetests/filetests/isa/x64/fsqrt.clif b/cranelift/filetests/filetests/isa/x64/fsqrt.clif new file mode 100644 index 000000000000..ffe971de14a5 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/fsqrt.clif @@ -0,0 +1,54 @@ +test compile precise-output +set enable_simd +target x86_64 + +function %sqrt_f32(f32) -> f32 { +block0(v0: f32): + v1 = sqrt v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sqrtss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; sqrtss %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %sqrt_f64(f64) -> f64 { +block0(v0: f64): + v1 = sqrt v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; sqrtsd %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; sqrtsd %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + From e6bf702c8fc9ca6566935628af51aa7ec7d7fed8 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Sat, 18 Mar 2023 12:19:57 -0700 Subject: [PATCH 3/3] x64: Add AVX support for `rounds{s,d}` --- cranelift/codegen/src/isa/x64/inst.isle | 8 ++ cranelift/codegen/src/isa/x64/inst/args.rs | 4 +- cranelift/codegen/src/isa/x64/inst/emit.rs | 17 ++- .../filetests/filetests/isa/x64/ceil-avx.clif | 104 ++++++++++++++++++ cranelift/foo.clif | 5 - 5 files changed, 129 insertions(+), 9 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/x64/ceil-avx.clif delete mode 100644 cranelift/foo.clif diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle index d0cfa5821ca0..1ca8b9f5f841 100644 --- a/cranelift/codegen/src/isa/x64/inst.isle +++ b/cranelift/codegen/src/isa/x64/inst.isle @@ -1303,6 +1303,8 @@ Vcvtsd2ss Vsqrtss Vsqrtsd + Vroundss + Vroundsd )) (type Avx512Opcode extern @@ -3352,11 +3354,17 @@ (decl x64_roundss (XmmMem RoundImm) Xmm) (rule (x64_roundss src1 round) (xmm_unary_rm_r_imm (SseOpcode.Roundss) src1 (encode_round_imm round))) +(rule 1 (x64_roundss src1 round) + (if-let $true (use_avx_simd)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundss) src1 (encode_round_imm round))) ;; Helper for creating `roundsd` instructions. (decl x64_roundsd (XmmMem RoundImm) Xmm) (rule (x64_roundsd src1 round) (xmm_unary_rm_r_imm (SseOpcode.Roundsd) src1 (encode_round_imm round))) +(rule 1 (x64_roundsd src1 round) + (if-let $true (use_avx_simd)) + (xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundsd) src1 (encode_round_imm round))) ;; Helper for creating `roundps` instructions. (decl x64_roundps (XmmMem RoundImm) Xmm) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 18a2b2469224..b2301f729724 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -1726,7 +1726,9 @@ impl AvxOpcode { | AvxOpcode::Vcvtss2sd | AvxOpcode::Vcvtsd2ss | AvxOpcode::Vsqrtss - | AvxOpcode::Vsqrtsd => { + | AvxOpcode::Vsqrtsd + | AvxOpcode::Vroundss + | AvxOpcode::Vroundsd => { smallvec![InstructionSet::AVX] } diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index a9366a5df998..e2ca40cfe493 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -2452,18 +2452,29 @@ pub(crate) fn emit( AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70), AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70), AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70), + AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A), + AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B), _ => panic!("unexpected rmr_imm_vex opcode {op:?}"), }; - VexInstruction::new() + let vex = VexInstruction::new() .length(VexVectorLength::V128) .prefix(prefix) .map(map) .opcode(opcode) .reg(dst.to_real_reg().unwrap().hw_enc()) .rm(src) - .imm(*imm) - .encode(sink); + .imm(*imm); + + // See comments in similar block above in `XmmUnaryRmRVex` for what + // this is doing. + let vex = match op { + AvxOpcode::Vroundss | AvxOpcode::Vroundsd => { + vex.vvvv(dst.to_real_reg().unwrap().hw_enc()) + } + _ => vex, + }; + vex.encode(sink); } Inst::XmmMovRMVex { op, src, dst } => { diff --git a/cranelift/filetests/filetests/isa/x64/ceil-avx.clif b/cranelift/filetests/filetests/isa/x64/ceil-avx.clif new file mode 100644 index 000000000000..cee651622aef --- /dev/null +++ b/cranelift/filetests/filetests/isa/x64/ceil-avx.clif @@ -0,0 +1,104 @@ +test compile precise-output +set enable_simd +target x86_64 has_avx + +function %f1(f32) -> f32 { +block0(v0: f32): + v1 = ceil v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vroundss $2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vroundss $2, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f2(f64) -> f64 { +block0(v0: f64): + v1 = ceil v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vroundsd $2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vroundsd $2, %xmm0, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f4(f32x4) -> f32x4 { +block0(v0: f32x4): + v1 = ceil v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vroundps $2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vroundps $2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + +function %f4(f64x2) -> f64x2 { +block0(v0: f64x2): + v1 = ceil v0 + return v1 +} + +; VCode: +; pushq %rbp +; movq %rsp, %rbp +; block0: +; vroundpd $2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; pushq %rbp +; movq %rsp, %rbp +; block1: ; offset 0x4 +; vroundpd $2, %xmm0, %xmm0 +; movq %rbp, %rsp +; popq %rbp +; retq + diff --git a/cranelift/foo.clif b/cranelift/foo.clif deleted file mode 100644 index e96ed26f90c9..000000000000 --- a/cranelift/foo.clif +++ /dev/null @@ -1,5 +0,0 @@ -function %sqrt_f32(f32) -> f32 { -block0(v0: f32): - v1 = sqrt v0 - return v1 -}