x64: Add AVX support for some more float-related instructions #6092

Merged · 3 commits · Mar 29, 2023
32 changes: 28 additions & 4 deletions cranelift/codegen/src/isa/x64/inst.isle
@@ -1299,6 +1299,12 @@
Vpmovmskb
Vcvtsi2ss
Vcvtsi2sd
Vcvtss2sd
Vcvtsd2ss
Vsqrtss
Vsqrtsd
Vroundss
Vroundsd
))

(type Avx512Opcode extern
@@ -3348,11 +3354,17 @@
(decl x64_roundss (XmmMem RoundImm) Xmm)
(rule (x64_roundss src1 round)
(xmm_unary_rm_r_imm (SseOpcode.Roundss) src1 (encode_round_imm round)))
(rule 1 (x64_roundss src1 round)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundss) src1 (encode_round_imm round)))

;; Helper for creating `roundsd` instructions.
(decl x64_roundsd (XmmMem RoundImm) Xmm)
(rule (x64_roundsd src1 round)
(xmm_unary_rm_r_imm (SseOpcode.Roundsd) src1 (encode_round_imm round)))
(rule 1 (x64_roundsd src1 round)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_imm_vex (AvxOpcode.Vroundsd) src1 (encode_round_imm round)))

;; Helper for creating `roundps` instructions.
(decl x64_roundps (XmmMem RoundImm) Xmm)
@@ -3985,10 +3997,16 @@
;; Helper for creating `sqrtss` instructions.
(decl x64_sqrtss (XmmMem) Xmm)
(rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x))
(rule 1 (x64_sqrtss x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vsqrtss) x))

;; Helper for creating `sqrtsd` instructions.
(decl x64_sqrtsd (XmmMem) Xmm)
(rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x))
(rule 1 (x64_sqrtsd x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vsqrtsd) x))

;; Helper for creating `sqrtps` instructions.
(decl x64_sqrtps (XmmMem) Xmm)
@@ -4005,12 +4023,18 @@
(xmm_unary_rm_r_vex (AvxOpcode.Vsqrtpd) x))

;; Helper for creating `cvtss2sd` instructions.
(decl x64_cvtss2sd (Xmm) Xmm)
(rule (x64_cvtss2sd x) (xmm_unary_rm_r (SseOpcode.Cvtss2sd) x))
(decl x64_cvtss2sd (XmmMem) Xmm)
(rule (x64_cvtss2sd x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtss2sd) x))
(rule 1 (x64_cvtss2sd x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vcvtss2sd) x))

;; Helper for creating `cvtsd2ss` instructions.
(decl x64_cvtsd2ss (Xmm) Xmm)
(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x))
(decl x64_cvtsd2ss (XmmMem) Xmm)
(rule (x64_cvtsd2ss x) (xmm_unary_rm_r_unaligned (SseOpcode.Cvtsd2ss) x))
(rule 1 (x64_cvtsd2ss x)
(if-let $true (use_avx_simd))
(xmm_unary_rm_r_vex (AvxOpcode.Vcvtsd2ss) x))

;; Helper for creating `cvtdq2ps` instructions.
(decl x64_cvtdq2ps (XmmMem) Xmm)
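
The new ISLE helpers above all follow the same pattern already used in this file: the unnumbered rule emits the legacy SSE form, while a higher-priority `rule 1` guarded by `use_avx_simd` selects the VEX-encoded form when AVX is available. A minimal Rust sketch of that dispatch follows; it is purely illustrative, not Cranelift code, and the exact condition behind `use_avx_simd` is an assumption here.

// Illustrative stand-in for the ISA flags the ISLE rules consult.
#[derive(Clone, Copy)]
struct IsaFlags {
    has_avx: bool,
    enable_simd: bool,
}

// Assumed condition for `use_avx_simd`; the real predicate lives in the
// x64 backend and may differ.
fn use_avx_simd(flags: IsaFlags) -> bool {
    flags.has_avx && flags.enable_simd
}

// Which mnemonic an `x64_sqrtss`-style helper would pick: the priority-1
// rule (VEX) shadows the base rule (SSE) whenever its guard holds.
fn x64_sqrtss_opcode(flags: IsaFlags) -> &'static str {
    if use_avx_simd(flags) { "vsqrtss" } else { "sqrtss" }
}

fn main() {
    let avx = IsaFlags { has_avx: true, enable_simd: true };
    let sse_only = IsaFlags { has_avx: false, enable_simd: true };
    assert_eq!(x64_sqrtss_opcode(avx), "vsqrtss");
    assert_eq!(x64_sqrtss_opcode(sse_only), "sqrtss");
}
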
8 changes: 7 additions & 1 deletion cranelift/codegen/src/isa/x64/inst/args.rs
@@ -1722,7 +1722,13 @@ impl AvxOpcode {
| AvxOpcode::Vmovmskpd
| AvxOpcode::Vpmovmskb
| AvxOpcode::Vcvtsi2ss
| AvxOpcode::Vcvtsi2sd => {
| AvxOpcode::Vcvtsi2sd
| AvxOpcode::Vcvtss2sd
| AvxOpcode::Vcvtsd2ss
| AvxOpcode::Vsqrtss
| AvxOpcode::Vsqrtsd
| AvxOpcode::Vroundss
| AvxOpcode::Vroundsd => {
smallvec![InstructionSet::AVX]
}

42 changes: 36 additions & 6 deletions cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -2405,17 +2405,36 @@ pub(crate) fn emit(
AvxOpcode::Vbroadcastss => (LegacyPrefixes::_66, OpcodeMap::_0F38, 0x18),
AvxOpcode::Vmovddup => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x12),

AvxOpcode::Vcvtss2sd => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vcvtsd2ss => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x5A),
AvxOpcode::Vsqrtss => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x51),
AvxOpcode::Vsqrtsd => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x51),

_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

VexInstruction::new()
let vex = VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.encode(sink);
.rm(src);

// These opcodes take a second operand through `vvvv` which copies
// the upper bits into the destination register. That's not
// reflected in the CLIF instruction, however, since the SSE version
// doesn't have this functionality. Instead just copy whatever
// happens to already be in the destination, which at least is what
// LLVM seems to do.
let vex = match op {
AvxOpcode::Vcvtss2sd
| AvxOpcode::Vcvtsd2ss
| AvxOpcode::Vsqrtss
| AvxOpcode::Vsqrtsd => vex.vvvv(dst.to_real_reg().unwrap().hw_enc()),
_ => vex,
};
vex.encode(sink);
}

Inst::XmmUnaryRmRImmVex { op, src, dst, imm } => {
@@ -2433,18 +2452,29 @@
AvxOpcode::Vpshuflw => (LegacyPrefixes::_F2, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufhw => (LegacyPrefixes::_F3, OpcodeMap::_0F, 0x70),
AvxOpcode::Vpshufd => (LegacyPrefixes::_66, OpcodeMap::_0F, 0x70),
AvxOpcode::Vroundss => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0A),
AvxOpcode::Vroundsd => (LegacyPrefixes::_66, OpcodeMap::_0F3A, 0x0B),
_ => panic!("unexpected rmr_imm_vex opcode {op:?}"),
};

VexInstruction::new()
let vex = VexInstruction::new()
.length(VexVectorLength::V128)
.prefix(prefix)
.map(map)
.opcode(opcode)
.reg(dst.to_real_reg().unwrap().hw_enc())
.rm(src)
.imm(*imm)
.encode(sink);
.imm(*imm);

// See comments in similar block above in `XmmUnaryRmRVex` for what
// this is doing.
let vex = match op {
AvxOpcode::Vroundss | AvxOpcode::Vroundsd => {
vex.vvvv(dst.to_real_reg().unwrap().hw_enc())
}
_ => vex,
};
vex.encode(sink);
}

Inst::XmmMovRMVex { op, src, dst } => {
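
The `vvvv` handling added in `emit.rs` above decides where the untouched upper lanes of a scalar result come from: the AVX scalar forms take a second source register in `vvvv` and copy its upper bits into the destination, and the lowering passes the destination register itself so those bits are simply preserved, which the in-code comment notes appears to match LLVM's choice. Below is a small, self-contained Rust model of that merge behavior for `vsqrtss`; the function name is illustrative and this is not Cranelift code. In the disassembly shown in the new filetests, this is why the VEX forms carry one more register operand than the corresponding VCode line.

// Models `vsqrtss dst, vvvv, src` on a 128-bit register: lane 0 is computed
// from `src`, lanes 1..3 are copied from the `vvvv` source.
fn vsqrtss_model(vvvv: [f32; 4], src: [f32; 4]) -> [f32; 4] {
    [src[0].sqrt(), vvvv[1], vvvv[2], vvvv[3]]
}

fn main() {
    let dst_before = [9.0_f32, 1.0, 2.0, 3.0];
    let src = [4.0_f32, 0.0, 0.0, 0.0];
    // Passing the destination as `vvvv`, as the emit code does, leaves the
    // destination's upper lanes unchanged.
    assert_eq!(vsqrtss_model(dst_before, src), [2.0, 1.0, 2.0, 3.0]);
}
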
104 changes: 104 additions & 0 deletions cranelift/filetests/filetests/isa/x64/ceil-avx.clif
@@ -0,0 +1,104 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %f1(f32) -> f32 {
block0(v0: f32):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundss $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundss $2, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f2(f64) -> f64 {
block0(v0: f64):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundsd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundsd $2, %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f4(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundps $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundps $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %f4(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = ceil v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vroundpd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vroundpd $2, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

130 changes: 130 additions & 0 deletions cranelift/filetests/filetests/isa/x64/fpromote-demote-avx.clif
@@ -0,0 +1,130 @@
test compile precise-output
set enable_simd
target x86_64 has_avx

function %fpromote(f32) -> f64 {
block0(v0: f32):
v1 = fpromote.f64 v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vcvtss2sd %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vcvtss2sd %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fpromote_load(i64, f32) -> f64 {
ss0 = explicit_slot 16

block0(v1: i64, v2: f32):
v3 = stack_addr.i64 ss0
store.f32 v2, v3
v4 = load.f32 v3
v5 = fpromote.f64 v4
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; vmovss %xmm0, 0(%rdx)
; vcvtss2sd 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; vmovss %xmm0, (%rdx) ; trap: heap_oob
; vcvtss2sd (%rdx), %xmm0, %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote(f64) -> f32 {
block0(v0: f64):
v1 = fdemote.f32 v0
return v1
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vcvtsd2ss %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vcvtsd2ss %xmm0, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq

function %fdemote_load(i64, f64) -> f32 {
ss0 = explicit_slot 16

block0(v1: i64, v2: f64):
v3 = stack_addr.i64 ss0
store.f64 v2, v3
v4 = load.f64 v3
v5 = fdemote.f32 v4
return v5
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; subq %rsp, $16, %rsp
; block0:
; lea rsp(0 + virtual offset), %rdx
; vmovsd %xmm0, 0(%rdx)
; vcvtsd2ss 0(%rdx), %xmm0
; addq %rsp, $16, %rsp
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; subq $0x10, %rsp
; block1: ; offset 0x8
; leaq (%rsp), %rdx
; vmovsd %xmm0, (%rdx) ; trap: heap_oob
; vcvtsd2ss (%rdx), %xmm0, %xmm0 ; trap: heap_oob
; addq $0x10, %rsp
; movq %rbp, %rsp
; popq %rbp
; retq
