Skip to content

Commit 402a6a5

Browse files
authored
Fix another libcall popping up with simd (#10735)
This commit is similar to #10699: another instance of a libcall popping up late in the x64 backend. Fuzzing found this issue. To help verify this is the last one, I've run the whole `*.wast` test suite with the x86_64 baseline (no target features); I saw the panic before this PR and no more panics after.
1 parent 5ded0f4 commit 402a6a5

File tree

2 files changed

+156
-117
lines changed

2 files changed

+156
-117
lines changed

crates/cranelift/src/translate/code_translator.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2041,11 +2041,26 @@ pub fn translate_operator(
20412041
}
20422042
Operator::I32x4RelaxedTruncF64x2UZero | Operator::I32x4TruncSatF64x2UZero => {
20432043
let a = pop1_with_bitcast(state, F64X2, builder);
2044-
let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
2045-
let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into());
2046-
let zero = builder.ins().vconst(I64X2, handle);
2047-
2048-
state.push1(builder.ins().uunarrow(converted_a, zero));
2044+
let zero_constant = builder.func.dfg.constants.insert(vec![0u8; 16].into());
2045+
let result = if environ.is_x86() && !environ.isa().has_round() {
2046+
// On x86 the vector lowering for `fcvt_to_uint_sat` requires
2047+
// SSE4.1 `round` instructions. If SSE4.1 isn't available it
2048+
// falls back to a libcall which we don't want in Wasmtime.
2049+
// Handle this by falling back to the scalar implementation
2050+
// which does not require SSE4.1 instructions.
2051+
let lane0 = builder.ins().extractlane(a, 0);
2052+
let lane1 = builder.ins().extractlane(a, 1);
2053+
let lane0_rounded = builder.ins().fcvt_to_uint_sat(I32, lane0);
2054+
let lane1_rounded = builder.ins().fcvt_to_uint_sat(I32, lane1);
2055+
let result = builder.ins().vconst(I32X4, zero_constant);
2056+
let result = builder.ins().insertlane(result, lane0_rounded, 0);
2057+
builder.ins().insertlane(result, lane1_rounded, 1)
2058+
} else {
2059+
let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
2060+
let zero = builder.ins().vconst(I64X2, zero_constant);
2061+
builder.ins().uunarrow(converted_a, zero)
2062+
};
2063+
state.push1(result);
20492064
}
20502065

20512066
Operator::I8x16NarrowI16x8S => {

tests/disas/x64-simd-round-without-see41.wat

Lines changed: 136 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
(func $f64x2.floor (param v128) (result v128) (f64x2.floor (local.get 0)))
1010
(func $f64x2.trunc (param v128) (result v128) (f64x2.trunc (local.get 0)))
1111
(func $f64x2.nearest (param v128) (result v128) (f64x2.nearest (local.get 0)))
12+
(func $i32x4.trunc_sat_f64x2_u_zero (param v128) (result v128) (i32x4.trunc_sat_f64x2_u_zero (local.get 0)))
1213
)
1314
;; function u0:0(i64 vmctx, i64, i8x16) -> i8x16 tail {
1415
;; gv0 = vmctx
@@ -21,25 +22,25 @@
2122
;; stack_limit = gv2
2223
;;
2324
;; block0(v0: i64, v1: i64, v2: i8x16):
24-
;; @0022 v4 = bitcast.f32x4 little v2
25-
;; @0022 v6 = vconst.f32x4 const0
26-
;; @0022 v7 = extractlane v4, 0
27-
;; @0022 v8 = call fn0(v0, v7)
28-
;; @0022 v9 = insertlane v6, v8, 0 ; v6 = const0
29-
;; @0022 v10 = extractlane v4, 1
30-
;; @0022 v11 = call fn0(v0, v10)
31-
;; @0022 v12 = insertlane v9, v11, 1
32-
;; @0022 v13 = extractlane v4, 2
33-
;; @0022 v14 = call fn0(v0, v13)
34-
;; @0022 v15 = insertlane v12, v14, 2
35-
;; @0022 v16 = extractlane v4, 3
36-
;; @0022 v17 = call fn0(v0, v16)
37-
;; @0022 v18 = insertlane v15, v17, 3
38-
;; @0024 v19 = bitcast.i8x16 little v18
39-
;; @0024 jump block1
25+
;; @0023 v4 = bitcast.f32x4 little v2
26+
;; @0023 v6 = vconst.f32x4 const0
27+
;; @0023 v7 = extractlane v4, 0
28+
;; @0023 v8 = call fn0(v0, v7)
29+
;; @0023 v9 = insertlane v6, v8, 0 ; v6 = const0
30+
;; @0023 v10 = extractlane v4, 1
31+
;; @0023 v11 = call fn0(v0, v10)
32+
;; @0023 v12 = insertlane v9, v11, 1
33+
;; @0023 v13 = extractlane v4, 2
34+
;; @0023 v14 = call fn0(v0, v13)
35+
;; @0023 v15 = insertlane v12, v14, 2
36+
;; @0023 v16 = extractlane v4, 3
37+
;; @0023 v17 = call fn0(v0, v16)
38+
;; @0023 v18 = insertlane v15, v17, 3
39+
;; @0025 v19 = bitcast.i8x16 little v18
40+
;; @0025 jump block1
4041
;;
4142
;; block1:
42-
;; @0024 return v19
43+
;; @0025 return v19
4344
;; }
4445
;;
4546
;; function u0:1(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -53,25 +54,25 @@
5354
;; stack_limit = gv2
5455
;;
5556
;; block0(v0: i64, v1: i64, v2: i8x16):
56-
;; @0029 v4 = bitcast.f32x4 little v2
57-
;; @0029 v6 = vconst.f32x4 const0
58-
;; @0029 v7 = extractlane v4, 0
59-
;; @0029 v8 = call fn0(v0, v7)
60-
;; @0029 v9 = insertlane v6, v8, 0 ; v6 = const0
61-
;; @0029 v10 = extractlane v4, 1
62-
;; @0029 v11 = call fn0(v0, v10)
63-
;; @0029 v12 = insertlane v9, v11, 1
64-
;; @0029 v13 = extractlane v4, 2
65-
;; @0029 v14 = call fn0(v0, v13)
66-
;; @0029 v15 = insertlane v12, v14, 2
67-
;; @0029 v16 = extractlane v4, 3
68-
;; @0029 v17 = call fn0(v0, v16)
69-
;; @0029 v18 = insertlane v15, v17, 3
70-
;; @002b v19 = bitcast.i8x16 little v18
71-
;; @002b jump block1
57+
;; @002a v4 = bitcast.f32x4 little v2
58+
;; @002a v6 = vconst.f32x4 const0
59+
;; @002a v7 = extractlane v4, 0
60+
;; @002a v8 = call fn0(v0, v7)
61+
;; @002a v9 = insertlane v6, v8, 0 ; v6 = const0
62+
;; @002a v10 = extractlane v4, 1
63+
;; @002a v11 = call fn0(v0, v10)
64+
;; @002a v12 = insertlane v9, v11, 1
65+
;; @002a v13 = extractlane v4, 2
66+
;; @002a v14 = call fn0(v0, v13)
67+
;; @002a v15 = insertlane v12, v14, 2
68+
;; @002a v16 = extractlane v4, 3
69+
;; @002a v17 = call fn0(v0, v16)
70+
;; @002a v18 = insertlane v15, v17, 3
71+
;; @002c v19 = bitcast.i8x16 little v18
72+
;; @002c jump block1
7273
;;
7374
;; block1:
74-
;; @002b return v19
75+
;; @002c return v19
7576
;; }
7677
;;
7778
;; function u0:2(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -85,25 +86,25 @@
8586
;; stack_limit = gv2
8687
;;
8788
;; block0(v0: i64, v1: i64, v2: i8x16):
88-
;; @0030 v4 = bitcast.f32x4 little v2
89-
;; @0030 v6 = vconst.f32x4 const0
90-
;; @0030 v7 = extractlane v4, 0
91-
;; @0030 v8 = call fn0(v0, v7)
92-
;; @0030 v9 = insertlane v6, v8, 0 ; v6 = const0
93-
;; @0030 v10 = extractlane v4, 1
94-
;; @0030 v11 = call fn0(v0, v10)
95-
;; @0030 v12 = insertlane v9, v11, 1
96-
;; @0030 v13 = extractlane v4, 2
97-
;; @0030 v14 = call fn0(v0, v13)
98-
;; @0030 v15 = insertlane v12, v14, 2
99-
;; @0030 v16 = extractlane v4, 3
100-
;; @0030 v17 = call fn0(v0, v16)
101-
;; @0030 v18 = insertlane v15, v17, 3
102-
;; @0032 v19 = bitcast.i8x16 little v18
103-
;; @0032 jump block1
89+
;; @0031 v4 = bitcast.f32x4 little v2
90+
;; @0031 v6 = vconst.f32x4 const0
91+
;; @0031 v7 = extractlane v4, 0
92+
;; @0031 v8 = call fn0(v0, v7)
93+
;; @0031 v9 = insertlane v6, v8, 0 ; v6 = const0
94+
;; @0031 v10 = extractlane v4, 1
95+
;; @0031 v11 = call fn0(v0, v10)
96+
;; @0031 v12 = insertlane v9, v11, 1
97+
;; @0031 v13 = extractlane v4, 2
98+
;; @0031 v14 = call fn0(v0, v13)
99+
;; @0031 v15 = insertlane v12, v14, 2
100+
;; @0031 v16 = extractlane v4, 3
101+
;; @0031 v17 = call fn0(v0, v16)
102+
;; @0031 v18 = insertlane v15, v17, 3
103+
;; @0033 v19 = bitcast.i8x16 little v18
104+
;; @0033 jump block1
104105
;;
105106
;; block1:
106-
;; @0032 return v19
107+
;; @0033 return v19
107108
;; }
108109
;;
109110
;; function u0:3(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -117,25 +118,25 @@
117118
;; stack_limit = gv2
118119
;;
119120
;; block0(v0: i64, v1: i64, v2: i8x16):
120-
;; @0037 v4 = bitcast.f32x4 little v2
121-
;; @0037 v6 = vconst.f32x4 const0
122-
;; @0037 v7 = extractlane v4, 0
123-
;; @0037 v8 = call fn0(v0, v7)
124-
;; @0037 v9 = insertlane v6, v8, 0 ; v6 = const0
125-
;; @0037 v10 = extractlane v4, 1
126-
;; @0037 v11 = call fn0(v0, v10)
127-
;; @0037 v12 = insertlane v9, v11, 1
128-
;; @0037 v13 = extractlane v4, 2
129-
;; @0037 v14 = call fn0(v0, v13)
130-
;; @0037 v15 = insertlane v12, v14, 2
131-
;; @0037 v16 = extractlane v4, 3
132-
;; @0037 v17 = call fn0(v0, v16)
133-
;; @0037 v18 = insertlane v15, v17, 3
134-
;; @0039 v19 = bitcast.i8x16 little v18
135-
;; @0039 jump block1
121+
;; @0038 v4 = bitcast.f32x4 little v2
122+
;; @0038 v6 = vconst.f32x4 const0
123+
;; @0038 v7 = extractlane v4, 0
124+
;; @0038 v8 = call fn0(v0, v7)
125+
;; @0038 v9 = insertlane v6, v8, 0 ; v6 = const0
126+
;; @0038 v10 = extractlane v4, 1
127+
;; @0038 v11 = call fn0(v0, v10)
128+
;; @0038 v12 = insertlane v9, v11, 1
129+
;; @0038 v13 = extractlane v4, 2
130+
;; @0038 v14 = call fn0(v0, v13)
131+
;; @0038 v15 = insertlane v12, v14, 2
132+
;; @0038 v16 = extractlane v4, 3
133+
;; @0038 v17 = call fn0(v0, v16)
134+
;; @0038 v18 = insertlane v15, v17, 3
135+
;; @003a v19 = bitcast.i8x16 little v18
136+
;; @003a jump block1
136137
;;
137138
;; block1:
138-
;; @0039 return v19
139+
;; @003a return v19
139140
;; }
140141
;;
141142
;; function u0:4(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -149,19 +150,19 @@
149150
;; stack_limit = gv2
150151
;;
151152
;; block0(v0: i64, v1: i64, v2: i8x16):
152-
;; @003e v4 = bitcast.f64x2 little v2
153-
;; @003e v6 = vconst.f64x2 const0
154-
;; @003e v7 = extractlane v4, 0
155-
;; @003e v8 = call fn0(v0, v7)
156-
;; @003e v9 = insertlane v6, v8, 0 ; v6 = const0
157-
;; @003e v10 = extractlane v4, 1
158-
;; @003e v11 = call fn0(v0, v10)
159-
;; @003e v12 = insertlane v9, v11, 1
160-
;; @0040 v13 = bitcast.i8x16 little v12
161-
;; @0040 jump block1
153+
;; @003f v4 = bitcast.f64x2 little v2
154+
;; @003f v6 = vconst.f64x2 const0
155+
;; @003f v7 = extractlane v4, 0
156+
;; @003f v8 = call fn0(v0, v7)
157+
;; @003f v9 = insertlane v6, v8, 0 ; v6 = const0
158+
;; @003f v10 = extractlane v4, 1
159+
;; @003f v11 = call fn0(v0, v10)
160+
;; @003f v12 = insertlane v9, v11, 1
161+
;; @0041 v13 = bitcast.i8x16 little v12
162+
;; @0041 jump block1
162163
;;
163164
;; block1:
164-
;; @0040 return v13
165+
;; @0041 return v13
165166
;; }
166167
;;
167168
;; function u0:5(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -175,19 +176,19 @@
175176
;; stack_limit = gv2
176177
;;
177178
;; block0(v0: i64, v1: i64, v2: i8x16):
178-
;; @0045 v4 = bitcast.f64x2 little v2
179-
;; @0045 v6 = vconst.f64x2 const0
180-
;; @0045 v7 = extractlane v4, 0
181-
;; @0045 v8 = call fn0(v0, v7)
182-
;; @0045 v9 = insertlane v6, v8, 0 ; v6 = const0
183-
;; @0045 v10 = extractlane v4, 1
184-
;; @0045 v11 = call fn0(v0, v10)
185-
;; @0045 v12 = insertlane v9, v11, 1
186-
;; @0047 v13 = bitcast.i8x16 little v12
187-
;; @0047 jump block1
179+
;; @0046 v4 = bitcast.f64x2 little v2
180+
;; @0046 v6 = vconst.f64x2 const0
181+
;; @0046 v7 = extractlane v4, 0
182+
;; @0046 v8 = call fn0(v0, v7)
183+
;; @0046 v9 = insertlane v6, v8, 0 ; v6 = const0
184+
;; @0046 v10 = extractlane v4, 1
185+
;; @0046 v11 = call fn0(v0, v10)
186+
;; @0046 v12 = insertlane v9, v11, 1
187+
;; @0048 v13 = bitcast.i8x16 little v12
188+
;; @0048 jump block1
188189
;;
189190
;; block1:
190-
;; @0047 return v13
191+
;; @0048 return v13
191192
;; }
192193
;;
193194
;; function u0:6(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -201,19 +202,19 @@
201202
;; stack_limit = gv2
202203
;;
203204
;; block0(v0: i64, v1: i64, v2: i8x16):
204-
;; @004c v4 = bitcast.f64x2 little v2
205-
;; @004c v6 = vconst.f64x2 const0
206-
;; @004c v7 = extractlane v4, 0
207-
;; @004c v8 = call fn0(v0, v7)
208-
;; @004c v9 = insertlane v6, v8, 0 ; v6 = const0
209-
;; @004c v10 = extractlane v4, 1
210-
;; @004c v11 = call fn0(v0, v10)
211-
;; @004c v12 = insertlane v9, v11, 1
212-
;; @004e v13 = bitcast.i8x16 little v12
213-
;; @004e jump block1
205+
;; @004d v4 = bitcast.f64x2 little v2
206+
;; @004d v6 = vconst.f64x2 const0
207+
;; @004d v7 = extractlane v4, 0
208+
;; @004d v8 = call fn0(v0, v7)
209+
;; @004d v9 = insertlane v6, v8, 0 ; v6 = const0
210+
;; @004d v10 = extractlane v4, 1
211+
;; @004d v11 = call fn0(v0, v10)
212+
;; @004d v12 = insertlane v9, v11, 1
213+
;; @004f v13 = bitcast.i8x16 little v12
214+
;; @004f jump block1
214215
;;
215216
;; block1:
216-
;; @004e return v13
217+
;; @004f return v13
217218
;; }
218219
;;
219220
;; function u0:7(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -227,17 +228,40 @@
227228
;; stack_limit = gv2
228229
;;
229230
;; block0(v0: i64, v1: i64, v2: i8x16):
230-
;; @0053 v4 = bitcast.f64x2 little v2
231-
;; @0053 v6 = vconst.f64x2 const0
232-
;; @0053 v7 = extractlane v4, 0
233-
;; @0053 v8 = call fn0(v0, v7)
234-
;; @0053 v9 = insertlane v6, v8, 0 ; v6 = const0
235-
;; @0053 v10 = extractlane v4, 1
236-
;; @0053 v11 = call fn0(v0, v10)
237-
;; @0053 v12 = insertlane v9, v11, 1
238-
;; @0056 v13 = bitcast.i8x16 little v12
239-
;; @0056 jump block1
231+
;; @0054 v4 = bitcast.f64x2 little v2
232+
;; @0054 v6 = vconst.f64x2 const0
233+
;; @0054 v7 = extractlane v4, 0
234+
;; @0054 v8 = call fn0(v0, v7)
235+
;; @0054 v9 = insertlane v6, v8, 0 ; v6 = const0
236+
;; @0054 v10 = extractlane v4, 1
237+
;; @0054 v11 = call fn0(v0, v10)
238+
;; @0054 v12 = insertlane v9, v11, 1
239+
;; @0057 v13 = bitcast.i8x16 little v12
240+
;; @0057 jump block1
240241
;;
241242
;; block1:
242-
;; @0056 return v13
243+
;; @0057 return v13
244+
;; }
245+
;;
246+
;; function u0:8(i64 vmctx, i64, i8x16) -> i8x16 tail {
247+
;; gv0 = vmctx
248+
;; gv1 = load.i64 notrap aligned readonly gv0+8
249+
;; gv2 = load.i64 notrap aligned gv1+16
250+
;; const0 = 0x00000000000000000000000000000000
251+
;; stack_limit = gv2
252+
;;
253+
;; block0(v0: i64, v1: i64, v2: i8x16):
254+
;; @005c v4 = bitcast.f64x2 little v2
255+
;; @005c v5 = extractlane v4, 0
256+
;; @005c v6 = extractlane v4, 1
257+
;; @005c v7 = fcvt_to_uint_sat.i32 v5
258+
;; @005c v8 = fcvt_to_uint_sat.i32 v6
259+
;; @005c v9 = vconst.i32x4 const0
260+
;; @005c v10 = insertlane v9, v7, 0 ; v9 = const0
261+
;; @005c v11 = insertlane v10, v8, 1
262+
;; @005f v12 = bitcast.i8x16 little v11
263+
;; @005f jump block1
264+
;;
265+
;; block1:
266+
;; @005f return v12
243267
;; }

0 commit comments

Comments
 (0)