[33.0.0] Backport some fixes from main (#10743)

alexcrichton · adambratschikaye · web-flow · commit ba64cd5913e1 · 2025-05-07T13:21:48.000-05:00
* Skip a test with threads on ASAN (#10728) It's expected that this has memory leaks, since there's no clean shutdown in the CLI right now. * Fix missing libcalls with simd float rounding (#10699) This commit fixes some more fallout found on oss-fuzz about the x64 backend generating rounding builtins when it shouldn't. This situation is caused by simd float rounding instructions, which the x64 backend lowers to a libcall per element; that logic now needs to move to the frontend instead. * Fix another libcall popping up with simd (#10735) This commit is similar to #10699: another instance of a libcall popping up late in the x64 backend. Fuzzing found this issue, and to help verify this is the last one, I've run the whole `*.wast` test suite with the x86_64 baseline (no target features) and saw the panic before this PR and no more panics after. * Inline assembler-x64 `generated_files` in `main.rs` (#10739) The public function `generated_files` in `cranelift-assembler-x64` makes the generated `rlib` non-deterministic because it contains the full paths of generated files. But this function is only used in `main.rs` of the same crate, so this change inlines it there to keep the library artifact deterministic while maintaining the same behavior. --------- Co-authored-by: Adam Bratschi-Kaye <adam.bratschikaye@dfinity.org>
diff --git a/cranelift/assembler-x64/src/lib.rs b/cranelift/assembler-x64/src/lib.rs
@@ -86,8 +86,3 @@ pub use mem::{
 };
 pub use rex::RexFlags;
 pub use xmm::Xmm;
-
-/// List the files generated to create this assembler.
-pub fn generated_files() -> Vec<std::path::PathBuf> {
-    include!(concat!(env!("OUT_DIR"), "/generated-files.rs"))
-}
diff --git a/cranelift/assembler-x64/src/main.rs b/cranelift/assembler-x64/src/main.rs
@@ -1,7 +1,8 @@
 //! Print the path to the generated code.
 
 fn main() {
-    for path in cranelift_assembler_x64::generated_files() {
+    let paths: Vec<std::path::PathBuf> = include!(concat!(env!("OUT_DIR"), "/generated-files.rs"));
+    for path in paths {
         println!("{}", path.display());
     }
 }
diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs
@@ -8,15 +8,15 @@ use crate::translate::{
 use crate::{BuiltinFunctionSignatures, TRAP_INTERNAL_ASSERT};
 use cranelift_codegen::cursor::FuncCursor;
 use cranelift_codegen::ir::condcodes::{FloatCC, IntCC};
-use cranelift_codegen::ir::immediates::{Imm64, Offset32};
+use cranelift_codegen::ir::immediates::{Imm64, Offset32, V128Imm};
 use cranelift_codegen::ir::pcc::Fact;
 use cranelift_codegen::ir::types::*;
 use cranelift_codegen::ir::{self, types};
 use cranelift_codegen::ir::{ArgumentPurpose, ConstantData, Function, InstBuilder, MemFlags};
 use cranelift_codegen::isa::{TargetFrontendConfig, TargetIsa};
 use cranelift_entity::{EntityRef, PrimaryMap, SecondaryMap};
-use cranelift_frontend::FunctionBuilder;
 use cranelift_frontend::Variable;
+use cranelift_frontend::{FuncInstBuilder, FunctionBuilder};
 use smallvec::SmallVec;
 use std::mem;
 use wasmparser::{Operator, WasmFeatures};
@@ -3319,103 +3319,193 @@ impl FuncEnvironment<'_> {
         let _ = (builder, num_pages, mem_index);
     }
 
-    pub fn ceil_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // If the ISA has rounding instructions, let Cranelift use them. But if
-        // not, lower to a libcall here, rather than having Cranelift do it. We
-        // can pass our libcall the vmctx pointer, which we use for stack
-        // overflow checking.
+    /// If the ISA has rounding instructions, let Cranelift use them. But if
+    /// not, lower to a libcall here, rather than having Cranelift do it. We
+    /// can pass our libcall the vmctx pointer, which we use for stack
+    /// overflow checking.
+    ///
+    /// This helper is generic for all rounding instructions below, both for
+    /// scalar and simd types. The `clif_round` argument is the CLIF-level
+    /// rounding instruction to use if the ISA has the instruction, and the
+    /// `round_builtin` helper is used to determine which element-level
+    /// rounding operation builtin is used. Note that this handles the case
+    /// when `value` is a vector by doing an element-wise libcall invocation.
+    fn isa_round(
+        &mut self,
+        builder: &mut FunctionBuilder,
+        value: ir::Value,
+        clif_round: fn(FuncInstBuilder<'_, '_>, ir::Value) -> ir::Value,
+        round_builtin: fn(&mut BuiltinFunctions, &mut Function) -> ir::FuncRef,
+    ) -> ir::Value {
         if self.isa.has_round() {
-            builder.ins().ceil(value)
-        } else {
-            let ceil = self.builtin_functions.ceil_f32(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(ceil, &[vmctx, value]);
+            return clif_round(builder.ins(), value);
+        }
+
+        let vmctx = self.vmctx_val(&mut builder.cursor());
+        let round = round_builtin(&mut self.builtin_functions, builder.func);
+        let round_one = |builder: &mut FunctionBuilder, value: ir::Value| {
+            let call = builder.ins().call(round, &[vmctx, value]);
             *builder.func.dfg.inst_results(call).first().unwrap()
+        };
+
+        let ty = builder.func.dfg.value_type(value);
+        if !ty.is_vector() {
+            return round_one(builder, value);
+        }
+
+        assert_eq!(ty.bits(), 128);
+        let zero = builder.func.dfg.constants.insert(V128Imm([0; 16]).into());
+        let mut result = builder.ins().vconst(ty, zero);
+        for i in 0..u8::try_from(ty.lane_count()).unwrap() {
+            let element = builder.ins().extractlane(value, i);
+            let element_rounded = round_one(builder, element);
+            result = builder.ins().insertlane(result, element_rounded, i);
         }
+        result
+    }
+
+    pub fn ceil_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.ceil(val),
+            BuiltinFunctions::ceil_f32,
+        )
     }
 
     pub fn ceil_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().ceil(value)
-        } else {
-            let ceil = self.builtin_functions.ceil_f64(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(ceil, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.ceil(val),
+            BuiltinFunctions::ceil_f64,
+        )
+    }
+
+    pub fn ceil_f32x4(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.ceil(val),
+            BuiltinFunctions::ceil_f32,
+        )
+    }
+
+    pub fn ceil_f64x2(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.ceil(val),
+            BuiltinFunctions::ceil_f64,
+        )
     }
 
     pub fn floor_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().floor(value)
-        } else {
-            let floor = self.builtin_functions.floor_f32(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(floor, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.floor(val),
+            BuiltinFunctions::floor_f32,
+        )
     }
 
     pub fn floor_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().floor(value)
-        } else {
-            let floor = self.builtin_functions.floor_f64(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(floor, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.floor(val),
+            BuiltinFunctions::floor_f64,
+        )
+    }
+
+    pub fn floor_f32x4(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.floor(val),
+            BuiltinFunctions::floor_f32,
+        )
+    }
+
+    pub fn floor_f64x2(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.floor(val),
+            BuiltinFunctions::floor_f64,
+        )
     }
 
     pub fn trunc_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().trunc(value)
-        } else {
-            let trunc = self.builtin_functions.trunc_f32(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(trunc, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.trunc(val),
+            BuiltinFunctions::trunc_f32,
+        )
     }
 
     pub fn trunc_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().trunc(value)
-        } else {
-            let trunc = self.builtin_functions.trunc_f64(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(trunc, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.trunc(val),
+            BuiltinFunctions::trunc_f64,
+        )
+    }
+
+    pub fn trunc_f32x4(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.trunc(val),
+            BuiltinFunctions::trunc_f32,
+        )
+    }
+
+    pub fn trunc_f64x2(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.trunc(val),
+            BuiltinFunctions::trunc_f64,
+        )
     }
 
     pub fn nearest_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().nearest(value)
-        } else {
-            let nearest = self.builtin_functions.nearest_f32(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(nearest, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.nearest(val),
+            BuiltinFunctions::nearest_f32,
+        )
     }
 
     pub fn nearest_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
-        // See the comments in `ceil_f32` about libcalls.
-        if self.isa.has_round() {
-            builder.ins().nearest(value)
-        } else {
-            let nearest = self.builtin_functions.nearest_f64(builder.func);
-            let vmctx = self.vmctx_val(&mut builder.cursor());
-            let call = builder.ins().call(nearest, &[vmctx, value]);
-            *builder.func.dfg.inst_results(call).first().unwrap()
-        }
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.nearest(val),
+            BuiltinFunctions::nearest_f64,
+        )
+    }
+
+    pub fn nearest_f32x4(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.nearest(val),
+            BuiltinFunctions::nearest_f32,
+        )
+    }
+
+    pub fn nearest_f64x2(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        self.isa_round(
+            builder,
+            value,
+            |ins, val| ins.nearest(val),
+            BuiltinFunctions::nearest_f64,
+        )
     }
 
     pub fn swizzle(
diff --git a/crates/cranelift/src/translate/code_translator.rs b/crates/cranelift/src/translate/code_translator.rs
@@ -2041,11 +2041,26 @@ pub fn translate_operator(
         }
         Operator::I32x4RelaxedTruncF64x2UZero | Operator::I32x4TruncSatF64x2UZero => {
             let a = pop1_with_bitcast(state, F64X2, builder);
-            let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
-            let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into());
-            let zero = builder.ins().vconst(I64X2, handle);
-
-            state.push1(builder.ins().uunarrow(converted_a, zero));
+            let zero_constant = builder.func.dfg.constants.insert(vec![0u8; 16].into());
+            let result = if environ.is_x86() && !environ.isa().has_round() {
+                // On x86 the vector lowering for `fcvt_to_uint_sat` requires
+                // SSE4.1 `round` instructions. If SSE4.1 isn't available it
+                // falls back to a libcall which we don't want in Wasmtime.
+                // Handle this by falling back to the scalar implementation
+                // which does not require SSE4.1 instructions.
+                let lane0 = builder.ins().extractlane(a, 0);
+                let lane1 = builder.ins().extractlane(a, 1);
+                let lane0_rounded = builder.ins().fcvt_to_uint_sat(I32, lane0);
+                let lane1_rounded = builder.ins().fcvt_to_uint_sat(I32, lane1);
+                let result = builder.ins().vconst(I32X4, zero_constant);
+                let result = builder.ins().insertlane(result, lane0_rounded, 0);
+                builder.ins().insertlane(result, lane1_rounded, 1)
+            } else {
+                let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
+                let zero = builder.ins().vconst(I64X2, zero_constant);
+                builder.ins().uunarrow(converted_a, zero)
+            };
+            state.push1(result);
         }
 
         Operator::I8x16NarrowI16x8S => {
@@ -2136,24 +2151,37 @@ pub fn translate_operator(
             let widen_high = builder.ins().uwiden_high(a);
             state.push1(builder.ins().iadd_pairwise(widen_low, widen_high));
         }
-        Operator::F32x4Ceil | Operator::F64x2Ceil => {
-            // This is something of a misuse of `type_of`, because that produces the return type
-            // of `op`.  In this case we want the arg type, but we know it's the same as the
-            // return type.  Same for the 3 cases below.
-            let arg = pop1_with_bitcast(state, type_of(op), builder);
-            state.push1(builder.ins().ceil(arg));
+        Operator::F32x4Ceil => {
+            let arg = pop1_with_bitcast(state, F32X4, builder);
+            state.push1(environ.ceil_f32x4(builder, arg));
         }
-        Operator::F32x4Floor | Operator::F64x2Floor => {
-            let arg = pop1_with_bitcast(state, type_of(op), builder);
-            state.push1(builder.ins().floor(arg));
+        Operator::F64x2Ceil => {
+            let arg = pop1_with_bitcast(state, F64X2, builder);
+            state.push1(environ.ceil_f64x2(builder, arg));
         }
-        Operator::F32x4Trunc | Operator::F64x2Trunc => {
-            let arg = pop1_with_bitcast(state, type_of(op), builder);
-            state.push1(builder.ins().trunc(arg));
+        Operator::F32x4Floor => {
+            let arg = pop1_with_bitcast(state, F32X4, builder);
+            state.push1(environ.floor_f32x4(builder, arg));
         }
-        Operator::F32x4Nearest | Operator::F64x2Nearest => {
-            let arg = pop1_with_bitcast(state, type_of(op), builder);
-            state.push1(builder.ins().nearest(arg));
+        Operator::F64x2Floor => {
+            let arg = pop1_with_bitcast(state, F64X2, builder);
+            state.push1(environ.floor_f64x2(builder, arg));
+        }
+        Operator::F32x4Trunc => {
+            let arg = pop1_with_bitcast(state, F32X4, builder);
+            state.push1(environ.trunc_f32x4(builder, arg));
+        }
+        Operator::F64x2Trunc => {
+            let arg = pop1_with_bitcast(state, F64X2, builder);
+            state.push1(environ.trunc_f64x2(builder, arg));
+        }
+        Operator::F32x4Nearest => {
+            let arg = pop1_with_bitcast(state, F32X4, builder);
+            state.push1(environ.nearest_f32x4(builder, arg));
+        }
+        Operator::F64x2Nearest => {
+            let arg = pop1_with_bitcast(state, F64X2, builder);
+            state.push1(environ.nearest_f64x2(builder, arg));
         }
         Operator::I32x4DotI16x8S => {
             let (a, b) = pop2_with_bitcast(state, I16X8, builder);
diff --git a/tests/all/cli_tests.rs b/tests/all/cli_tests.rs
diff --git a/tests/disas/x64-simd-round-without-see41.wat b/tests/disas/x64-simd-round-without-see41.wat

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,8 @@`
`1`	`1`	`//! Print the path to the generated code.`
`2`	`2`
`3`	`3`	`fn main() {`
`4`		`- for path in cranelift_assembler_x64::generated_files() {`
	`4`	`+ let paths: Vec<std::path::PathBuf> = include!(concat!(env!("OUT_DIR"), "/generated-files.rs"));`
	`5`	`+ for path in paths {`
`5`	`6`	`println!("{}", path.display());`
`6`	`7`	`}`
`7`	`8`	`}`