From 14ee88686a4c3064fd5d996b7818130c3681092a Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Sat, 18 Mar 2023 12:03:10 -0700
Subject: [PATCH 1/3] x64: Add AVX encodings of `vcvt{ss2sd,sd2ss}`

Additionally update the instruction helpers to take an `XmmMem` argument
to allow load sinking into the instruction.
---
 cranelift/codegen/src/isa/x64/inst.isle       |  16 ++-
 cranelift/codegen/src/isa/x64/inst/args.rs    |   4 +-
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  19 ++-
 .../isa/x64/fpromote-demote-avx.clif          | 130 ++++++++++++++++++
 .../filetests/isa/x64/fpromote-demote.clif    | 130 ++++++++++++++++++
 cranelift/foo.clif                            |   5 +
 6 files changed, 296 insertions(+), 8 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
 create mode 100644 cranelift/filetests/filetests/isa/x64/fpromote-demote.clif
 create mode 100644 cranelift/foo.clif

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 1fdc6b25c082..0d79fa629a0e 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1299,6 +1299,8 @@
             Vpmovmskb
             Vcvtsi2ss
             Vcvtsi2sd
+            Vcvtss2sd
+            Vcvtsd2ss
           ))
 
 (type Avx512Opcode extern
@@ -4005,12 +4007,18 @@
         (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtpd) x))
 
 ;; Helper for creating `cvtss2sd` instructions.
-(decl x64_cvtss2sd (Xmm) Xmm)
-(rule (x64_cvtss2sd x) (xmm_unary_rm_r (SseOpcode.Cvtss2sd) x))
+(decl x64_cvtss2sd (XmmMem) Xmm)
+(rule (x64_cvtss2sd x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtss2sd) x))
+(rule 1 (x64_cvtss2sd x)
+        (if-let $true (use_avx_simd))
+        (xmm_unary_rm_r_vex (AvxOpcode.Vcvtss2sd) x))
 
 ;; Helper for creating `cvtsd2ss` instructions.
-(decl x64_cvtsd2ss (Xmm) Xmm)
-(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x))
+(decl x64_cvtsd2ss (XmmMem) Xmm)
+(rule (x64_cvtsd2ss x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtsd2ss) x))
+(rule 1 (x64_cvtsd2ss x)
+        (if-let $true (use_avx_simd))
+        (xmm_unary_rm_r_vex (AvxOpcode.Vcvtsd2ss) x))
 
 ;; Helper for creating `cvtdq2ps` instructions.
 (decl x64_cvtdq2ps (XmmMem) Xmm)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index a135fc5af198..14c513e5edd1 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1722,7 +1722,9 @@ impl AvxOpcode {
             | AvxOpcode::Vmovmskpd
             | AvxOpcode::Vpmovmskb
             | AvxOpcode::Vcvtsi2ss
-            | AvxOpcode::Vcvtsi2sd => {
+            | AvxOpcode::Vcvtsi2sd
+            | AvxOpcode::Vcvtss2sd
+            | AvxOpcode::Vcvtsd2ss => {
                 smallvec![InstructionSet::AVX]
             }
 
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 2b0f3af084b0..d7a125f52f43 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2405,17 +2405,30 @@ pub(crate) fn emit(
                 AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
                 AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),
 
+                AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
+                AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
+
                 _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
             };
 
-            VexInstruction::new()
+            let mut vex = VexInstruction::new()
                 .length(VexVectorLength::V128)
                 .prefix(prefix)
                 .map(map)
                 .opcode(opcode)
                 .reg(dst.to_real_reg().unwrap().hw_enc())
-                .rm(src)
-                .encode(sink);
+                .rm(src);
+
+            // These opcodes take a second source operand through `vvvv` whose
+            // upper bits are copied into the destination register. That's not
+            // reflected in the CLIF instruction, however, since the SSE version
+            // doesn't have this functionality. Instead just copy whatever
+            // happens to already be in the destination, which at least is what
+            // LLVM seems to do.
+            if let AvxOpcode::Vcvtss2sd | AvxOpcode::Vcvtsd2ss = op {
+                vex = vex.vvvv(dst.to_real_reg().unwrap().hw_enc());
+            }
+            vex.encode(sink);
         }
 
         Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
diff --git a/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif b/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
new file mode 100644
index 000000000000..999d9d5ec5a7
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
@@ -0,0 +1,130 @@
+test compile precise-output
+set enable_simd
+target x86_64 has_avx
+
+function %fpromote(f32) -> f64 {
+block0(v0: f32):
+    v1 = fpromote.f64 v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vcvtss2sd %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vcvtss2sd %xmm0, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fpromote_load(i64, f32) -> f64 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f32):
+    v3 = stack_addr.i64 ss0
+    store.f32 v2, v3
+    v4 = load.f32 v3
+    v5 = fpromote.f64 v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+;   subq    %rsp, $16, %rsp
+; block0:
+;   lea     rsp(0 + virtual offset), %rdx
+;   vmovss  %xmm0, 0(%rdx)
+;   vcvtss2sd 0(%rdx), %xmm0
+;   addq    %rsp, $16, %rsp
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+;   subq $0x10, %rsp
+; block1: ; offset 0x8
+;   leaq (%rsp), %rdx
+;   vmovss %xmm0, (%rdx) ; trap: heap_oob
+;   vcvtss2sd (%rdx), %xmm0, %xmm0 ; trap: heap_oob
+;   addq $0x10, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fdemote(f64) -> f32 {
+block0(v0: f64):
+    v1 = fdemote.f32 v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vcvtsd2ss %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vcvtsd2ss %xmm0, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fdemote_load(i64, f64) -> f32 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f64):
+    v3 = stack_addr.i64 ss0
+    store.f64 v2, v3
+    v4 = load.f64 v3
+    v5 = fdemote.f32 v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+;   subq    %rsp, $16, %rsp
+; block0:
+;   lea     rsp(0 + virtual offset), %rdx
+;   vmovsd  %xmm0, 0(%rdx)
+;   vcvtsd2ss 0(%rdx), %xmm0
+;   addq    %rsp, $16, %rsp
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+;   subq $0x10, %rsp
+; block1: ; offset 0x8
+;   leaq (%rsp), %rdx
+;   vmovsd %xmm0, (%rdx) ; trap: heap_oob
+;   vcvtsd2ss (%rdx), %xmm0, %xmm0 ; trap: heap_oob
+;   addq $0x10, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif b/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif
new file mode 100644
index 000000000000..3f6cf72e307c
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/fpromote-demote.clif
@@ -0,0 +1,130 @@
+test compile precise-output
+set enable_simd
+target x86_64
+
+function %fpromote(f32) -> f64 {
+block0(v0: f32):
+    v1 = fpromote.f64 v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvtss2sd %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   cvtss2sd %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fpromote_load(i64, f32) -> f64 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f32):
+    v3 = stack_addr.i64 ss0
+    store.f32 v2, v3
+    v4 = load.f32 v3
+    v5 = fpromote.f64 v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+;   subq    %rsp, $16, %rsp
+; block0:
+;   lea     rsp(0 + virtual offset), %rdx
+;   movss   %xmm0, 0(%rdx)
+;   cvtss2sd 0(%rdx), %xmm0
+;   addq    %rsp, $16, %rsp
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+;   subq $0x10, %rsp
+; block1: ; offset 0x8
+;   leaq (%rsp), %rdx
+;   movss %xmm0, (%rdx) ; trap: heap_oob
+;   cvtss2sd (%rdx), %xmm0 ; trap: heap_oob
+;   addq $0x10, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fdemote(f64) -> f32 {
+block0(v0: f64):
+    v1 = fdemote.f32 v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   cvtsd2ss %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   cvtsd2ss %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %fdemote_load(i64, f64) -> f32 {
+    ss0 = explicit_slot 16
+
+block0(v1: i64, v2: f64):
+    v3 = stack_addr.i64 ss0
+    store.f64 v2, v3
+    v4 = load.f64 v3
+    v5 = fdemote.f32 v4
+    return v5
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+;   subq    %rsp, $16, %rsp
+; block0:
+;   lea     rsp(0 + virtual offset), %rdx
+;   movsd   %xmm0, 0(%rdx)
+;   cvtsd2ss 0(%rdx), %xmm0
+;   addq    %rsp, $16, %rsp
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+;   subq $0x10, %rsp
+; block1: ; offset 0x8
+;   leaq (%rsp), %rdx
+;   movsd %xmm0, (%rdx) ; trap: heap_oob
+;   cvtsd2ss (%rdx), %xmm0 ; trap: heap_oob
+;   addq $0x10, %rsp
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/foo.clif b/cranelift/foo.clif
new file mode 100644
index 000000000000..e96ed26f90c9
--- /dev/null
+++ b/cranelift/foo.clif
@@ -0,0 +1,5 @@
+function %sqrt_f32(f32) -> f32 {
+block0(v0: f32):
+    v1 = sqrt v0
+    return v1
+}

From 22ca7bd37d1e210f1417548af4fedf955b1f0588 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Sat, 18 Mar 2023 12:09:33 -0700
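Subject: [PATCH 2/3] x64: Add AVX encoding of `sqrts{s,d}`

Use `vsqrtss`/`vsqrtsd` in the `x64_sqrtss` and `x64_sqrtsd` helpers
when `use_avx_simd` holds. As with `vcvt{ss2sd,sd2ss}`, the destination
register is also passed as the `vvvv` operand when encoding.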
---
 cranelift/codegen/src/isa/x64/inst.isle       |  8 +++
 cranelift/codegen/src/isa/x64/inst/args.rs    |  4 +-
 cranelift/codegen/src/isa/x64/inst/emit.rs    | 14 +++--
 .../filetests/isa/x64/fsqrt-avx.clif          | 54 +++++++++++++++++++
 .../filetests/filetests/isa/x64/fsqrt.clif    | 54 +++++++++++++++++++
 5 files changed, 129 insertions(+), 5 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif
 create mode 100644 cranelift/filetests/filetests/isa/x64/fsqrt.clif

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index 0d79fa629a0e..d0cfa5821ca0 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1301,6 +1301,8 @@
             Vcvtsi2sd
             Vcvtss2sd
             Vcvtsd2ss
+            Vsqrtss
+            Vsqrtsd
           ))
 
 (type Avx512Opcode extern
@@ -3987,10 +3989,16 @@
 ;; Helper for creating `sqrtss` instructions.
 (decl x64_sqrtss (XmmMem) Xmm)
 (rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x))
+(rule 1 (x64_sqrtss x)
+        (if-let $true (use_avx_simd))
+        (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtss) x))
 
 ;; Helper for creating `sqrtsd` instructions.
 (decl x64_sqrtsd (XmmMem) Xmm)
 (rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x))
+(rule 1 (x64_sqrtsd x)
+        (if-let $true (use_avx_simd))
+        (xmm_unary_rm_r_vex (AvxOpcode.Vsqrtsd) x))
 
 ;; Helper for creating `sqrtps` instructions.
 (decl x64_sqrtps (XmmMem) Xmm)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 14c513e5edd1..18a2b2469224 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1724,7 +1724,9 @@ impl AvxOpcode {
             | AvxOpcode::Vcvtsi2ss
             | AvxOpcode::Vcvtsi2sd
             | AvxOpcode::Vcvtss2sd
-            | AvxOpcode::Vcvtsd2ss => {
+            | AvxOpcode::Vcvtsd2ss
+            | AvxOpcode::Vsqrtss
+            | AvxOpcode::Vsqrtsd => {
                 smallvec![InstructionSet::AVX]
             }
 
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index d7a125f52f43..a9366a5df998 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2407,11 +2407,13 @@ pub(crate) fn emit(
 
                 AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
                 AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
+                AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51),
+                AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51),
 
                 _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
             };
 
-            let mut vex = VexInstruction::new()
+            let vex = VexInstruction::new()
                 .length(VexVectorLength::V128)
                 .prefix(prefix)
                 .map(map)
@@ -2425,9 +2427,13 @@ pub(crate) fn emit(
             // doesn't have this functionality. Instead just copy whatever
             // happens to already be in the destination, which at least is what
             // LLVM seems to do.
-            if let AvxOpcode::Vcvtss2sd | AvxOpcode::Vcvtsd2ss = op {
-                vex = vex.vvvv(dst.to_real_reg().unwrap().hw_enc());
-            }
+            let vex = match op {
+                AvxOpcode::Vcvtss2sd
+                | AvxOpcode::Vcvtsd2ss
+                | AvxOpcode::Vsqrtss
+                | AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()),
+                _ => vex,
+            };
             vex.encode(sink);
         }
 
diff --git a/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif b/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif
new file mode 100644
index 000000000000..35fb832a8c6f
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/fsqrt-avx.clif
@@ -0,0 +1,54 @@
+test compile precise-output
+set enable_simd
+target x86_64 has_avx
+
+function %sqrt_f32(f32) -> f32 {
+block0(v0: f32):
+    v1 = sqrt v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vsqrtss %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vsqrtss %xmm0, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %sqrt_f64(f64) -> f64 {
+block0(v0: f64):
+    v1 = sqrt v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vsqrtsd %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vsqrtsd %xmm0, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/fsqrt.clif b/cranelift/filetests/filetests/isa/x64/fsqrt.clif
new file mode 100644
index 000000000000..ffe971de14a5
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/fsqrt.clif
@@ -0,0 +1,54 @@
+test compile precise-output
+set enable_simd
+target x86_64
+
+function %sqrt_f32(f32) -> f32 {
+block0(v0: f32):
+    v1 = sqrt v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   sqrtss  %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   sqrtss %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %sqrt_f64(f64) -> f64 {
+block0(v0: f64):
+    v1 = sqrt v0
+    return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   sqrtsd  %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   sqrtsd %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+

From e6bf702c8fc9ca6566935628af51aa7ec7d7fed8 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Sat, 18 Mar 2023 12:19:57 -0700
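Subject: [PATCH 3/3] x64: Add AVX support for `rounds{s,d}`

Use `vroundss`/`vroundsd` in the `x64_roundss` and `x64_roundsd` helpers
when `use_avx_simd` holds, passing the destination register as the
`vvvv` operand. Also delete `cranelift/foo.clif`, which was added in the
first patch of this series.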
---
 cranelift/codegen/src/isa/x64/inst.isle       |   8 ++
 cranelift/codegen/src/isa/x64/inst/args.rs    |   4 +-
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  17 ++-
 .../filetests/filetests/isa/x64/ceil-avx.clif | 104 ++++++++++++++++++
 cranelift/foo.clif                            |   5 -
 5 files changed, 129 insertions(+), 9 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/x64/ceil-avx.clif
 delete mode 100644 cranelift/foo.clif

diff --git a/cranelift/codegen/src/isa/x64/inst.isle b/cranelift/codegen/src/isa/x64/inst.isle
index d0cfa5821ca0..1ca8b9f5f841 100644
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -1303,6 +1303,8 @@
             Vcvtsd2ss
             Vsqrtss
             Vsqrtsd
+            Vroundss
+            Vroundsd
           ))
 
 (type Avx512Opcode extern
@@ -3352,11 +3354,17 @@
 (decl x64_roundss (XmmMem RoundImm) Xmm)
 (rule (x64_roundss src1 round)
       (xmm_unary_rm_r_imm (SseOpcode.Roundss) src1 (encode_round_imm round)))
+(rule 1 (x64_roundss src1 round)
+        (if-let $true (use_avx_simd))
+        (xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundss) src1 (encode_round_imm round)))
 
 ;; Helper for creating `roundsd` instructions.
 (decl x64_roundsd (XmmMem RoundImm) Xmm)
 (rule (x64_roundsd src1 round)
       (xmm_unary_rm_r_imm (SseOpcode.Roundsd) src1 (encode_round_imm round)))
+(rule 1 (x64_roundsd src1 round)
+        (if-let $true (use_avx_simd))
+        (xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundsd) src1 (encode_round_imm round)))
 
 ;; Helper for creating `roundps` instructions.
 (decl x64_roundps (XmmMem RoundImm) Xmm)
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 18a2b2469224..b2301f729724 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1726,7 +1726,9 @@ impl AvxOpcode {
             | AvxOpcode::Vcvtss2sd
             | AvxOpcode::Vcvtsd2ss
             | AvxOpcode::Vsqrtss
-            | AvxOpcode::Vsqrtsd => {
+            | AvxOpcode::Vsqrtsd
+            | AvxOpcode::Vroundss
+            | AvxOpcode::Vroundsd => {
                 smallvec![InstructionSet::AVX]
             }
 
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index a9366a5df998..e2ca40cfe493 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2452,18 +2452,29 @@ pub(crate) fn emit(
                 AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
                 AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
                 AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
+                AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
+                AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
                 _ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
             };
 
-            VexInstruction::new()
+            let vex = VexInstruction::new()
                 .length(VexVectorLength::V128)
                 .prefix(prefix)
                 .map(map)
                 .opcode(opcode)
                 .reg(dst.to_real_reg().unwrap().hw_enc())
                 .rm(src)
-                .imm(*imm)
-                .encode(sink);
+                .imm(*imm);
+
+            // See comments in similar block above in `XmmUnaryRmRVex` for what
+            // this is doing.
+            let vex = match op {
+                AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
+                    vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
+                }
+                _ => vex,
+            };
+            vex.encode(sink);
         }
 
         Inst::XmmMovRMVex { op, src, dst } => {
diff --git a/cranelift/filetests/filetests/isa/x64/ceil-avx.clif b/cranelift/filetests/filetests/isa/x64/ceil-avx.clif
new file mode 100644
index 000000000000..cee651622aef
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x64/ceil-avx.clif
@@ -0,0 +1,104 @@
+test compile precise-output
+set enable_simd
+target x86_64 has_avx
+
+function %f1(f32) -> f32 {
+block0(v0: f32):
+  v1 = ceil v0
+  return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vroundss $2, %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vroundss $2, %xmm0, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %f2(f64) -> f64 {
+block0(v0: f64):
+  v1 = ceil v0
+  return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vroundsd $2, %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vroundsd $2, %xmm0, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %f4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+  v1 = ceil v0
+  return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vroundps $2, %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vroundps $2, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
+function %f4(f64x2) -> f64x2 {
+block0(v0: f64x2):
+  v1 = ceil v0
+  return v1
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vroundpd $2, %xmm0, %xmm0
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vroundpd $2, %xmm0, %xmm0
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/foo.clif b/cranelift/foo.clif
deleted file mode 100644
index e96ed26f90c9..000000000000
--- a/cranelift/foo.clif
+++ /dev/null
@@ -1,5 +0,0 @@
-function %sqrt_f32(f32) -> f32 {
-block0(v0: f32):
-    v1 = sqrt v0
-    return v1
-}