Skip to content

Commit 402a6a5

Browse files
authored
Fix another libcall popping up with simd (#10735)
This commit is similar to #10699: another instance of a libcall popping up late in the x64 backend. Fuzzing found this issue. To help verify this is the last one, I've run the whole `*.wast` test suite with the x86_64 baseline (no target features); I saw the panic before this PR and no more panics after.
1 parent 5ded0f4 commit 402a6a5

File tree

2 files changed

+156
-117
lines changed

2 files changed

+156
-117
lines changed

crates/cranelift/src/translate/code_translator.rs

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2041,11 +2041,26 @@ pub fn translate_operator(
20412041
}
20422042
Operator::I32x4RelaxedTruncF64x2UZero | Operator::I32x4TruncSatF64x2UZero => {
20432043
let a = pop1_with_bitcast(state, F64X2, builder);
2044-
let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
2045-
let handle = builder.func.dfg.constants.insert(vec![0u8; 16].into());
2046-
let zero = builder.ins().vconst(I64X2, handle);
2047-
2048-
state.push1(builder.ins().uunarrow(converted_a, zero));
2044+
let zero_constant = builder.func.dfg.constants.insert(vec![0u8; 16].into());
2045+
let result = if environ.is_x86() && !environ.isa().has_round() {
2046+
// On x86 the vector lowering for `fcvt_to_uint_sat` requires
2047+
// SSE4.1 `round` instructions. If SSE4.1 isn't available it
2048+
// falls back to a libcall which we don't want in Wasmtime.
2049+
// Handle this by falling back to the scalar implementation
2050+
// which does not require SSE4.1 instructions.
2051+
let lane0 = builder.ins().extractlane(a, 0);
2052+
let lane1 = builder.ins().extractlane(a, 1);
2053+
let lane0_rounded = builder.ins().fcvt_to_uint_sat(I32, lane0);
2054+
let lane1_rounded = builder.ins().fcvt_to_uint_sat(I32, lane1);
2055+
let result = builder.ins().vconst(I32X4, zero_constant);
2056+
let result = builder.ins().insertlane(result, lane0_rounded, 0);
2057+
builder.ins().insertlane(result, lane1_rounded, 1)
2058+
} else {
2059+
let converted_a = builder.ins().fcvt_to_uint_sat(I64X2, a);
2060+
let zero = builder.ins().vconst(I64X2, zero_constant);
2061+
builder.ins().uunarrow(converted_a, zero)
2062+
};
2063+
state.push1(result);
20492064
}
20502065

20512066
Operator::I8x16NarrowI16x8S => {

tests/disas/x64-simd-round-without-see41.wat

Lines changed: 136 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
(func $f64x2.floor (param v128) (result v128) (f64x2.floor (local.get 0)))
1010
(func $f64x2.trunc (param v128) (result v128) (f64x2.trunc (local.get 0)))
1111
(func $f64x2.nearest (param v128) (result v128) (f64x2.nearest (local.get 0)))
12+
(func $i32x4.trunc_sat_f64x2_u_zero (param v128) (result v128) (i32x4.trunc_sat_f64x2_u_zero (local.get 0)))
1213
)
1314
;; function u0:0(i64 vmctx, i64, i8x16) -> i8x16 tail {
1415
;; gv0 = vmctx
@@ -21,25 +22,25 @@
2122
;; stack_limit = gv2
2223
;;
2324
;; block0(v0: i64, v1: i64, v2: i8x16):
24-
;; @0022 v4 = bitcast.f32x4 little v2
25-
;; @0022 v6 = vconst.f32x4 const0
26-
;; @0022 v7 = extractlane v4, 0
27-
;; @0022 v8 = call fn0(v0, v7)
28-
;; @0022 v9 = insertlane v6, v8, 0 ; v6 = const0
29-
;; @0022 v10 = extractlane v4, 1
30-
;; @0022 v11 = call fn0(v0, v10)
31-
;; @0022 v12 = insertlane v9, v11, 1
32-
;; @0022 v13 = extractlane v4, 2
33-
;; @0022 v14 = call fn0(v0, v13)
34-
;; @0022 v15 = insertlane v12, v14, 2
35-
;; @0022 v16 = extractlane v4, 3
36-
;; @0022 v17 = call fn0(v0, v16)
37-
;; @0022 v18 = insertlane v15, v17, 3
38-
;; @0024 v19 = bitcast.i8x16 little v18
39-
;; @0024 jump block1
25+
;; @0023 v4 = bitcast.f32x4 little v2
26+
;; @0023 v6 = vconst.f32x4 const0
27+
;; @0023 v7 = extractlane v4, 0
28+
;; @0023 v8 = call fn0(v0, v7)
29+
;; @0023 v9 = insertlane v6, v8, 0 ; v6 = const0
30+
;; @0023 v10 = extractlane v4, 1
31+
;; @0023 v11 = call fn0(v0, v10)
32+
;; @0023 v12 = insertlane v9, v11, 1
33+
;; @0023 v13 = extractlane v4, 2
34+
;; @0023 v14 = call fn0(v0, v13)
35+
;; @0023 v15 = insertlane v12, v14, 2
36+
;; @0023 v16 = extractlane v4, 3
37+
;; @0023 v17 = call fn0(v0, v16)
38+
;; @0023 v18 = insertlane v15, v17, 3
39+
;; @0025 v19 = bitcast.i8x16 little v18
40+
;; @0025 jump block1
4041
;;
4142
;; block1:
42-
;; @0024 return v19
43+
;; @0025 return v19
4344
;; }
4445
;;
4546
;; function u0:1(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -53,25 +54,25 @@
5354
;; stack_limit = gv2
5455
;;
5556
;; block0(v0: i64, v1: i64, v2: i8x16):
56-
;; @0029 v4 = bitcast.f32x4 little v2
57-
;; @0029 v6 = vconst.f32x4 const0
58-
;; @0029 v7 = extractlane v4, 0
59-
;; @0029 v8 = call fn0(v0, v7)
60-
;; @0029 v9 = insertlane v6, v8, 0 ; v6 = const0
61-
;; @0029 v10 = extractlane v4, 1
62-
;; @0029 v11 = call fn0(v0, v10)
63-
;; @0029 v12 = insertlane v9, v11, 1
64-
;; @0029 v13 = extractlane v4, 2
65-
;; @0029 v14 = call fn0(v0, v13)
66-
;; @0029 v15 = insertlane v12, v14, 2
67-
;; @0029 v16 = extractlane v4, 3
68-
;; @0029 v17 = call fn0(v0, v16)
69-
;; @0029 v18 = insertlane v15, v17, 3
70-
;; @002b v19 = bitcast.i8x16 little v18
71-
;; @002b jump block1
57+
;; @002a v4 = bitcast.f32x4 little v2
58+
;; @002a v6 = vconst.f32x4 const0
59+
;; @002a v7 = extractlane v4, 0
60+
;; @002a v8 = call fn0(v0, v7)
61+
;; @002a v9 = insertlane v6, v8, 0 ; v6 = const0
62+
;; @002a v10 = extractlane v4, 1
63+
;; @002a v11 = call fn0(v0, v10)
64+
;; @002a v12 = insertlane v9, v11, 1
65+
;; @002a v13 = extractlane v4, 2
66+
;; @002a v14 = call fn0(v0, v13)
67+
;; @002a v15 = insertlane v12, v14, 2
68+
;; @002a v16 = extractlane v4, 3
69+
;; @002a v17 = call fn0(v0, v16)
70+
;; @002a v18 = insertlane v15, v17, 3
71+
;; @002c v19 = bitcast.i8x16 little v18
72+
;; @002c jump block1
7273
;;
7374
;; block1:
74-
;; @002b return v19
75+
;; @002c return v19
7576
;; }
7677
;;
7778
;; function u0:2(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -85,25 +86,25 @@
8586
;; stack_limit = gv2
8687
;;
8788
;; block0(v0: i64, v1: i64, v2: i8x16):
88-
;; @0030 v4 = bitcast.f32x4 little v2
89-
;; @0030 v6 = vconst.f32x4 const0
90-
;; @0030 v7 = extractlane v4, 0
91-
;; @0030 v8 = call fn0(v0, v7)
92-
;; @0030 v9 = insertlane v6, v8, 0 ; v6 = const0
93-
;; @0030 v10 = extractlane v4, 1
94-
;; @0030 v11 = call fn0(v0, v10)
95-
;; @0030 v12 = insertlane v9, v11, 1
96-
;; @0030 v13 = extractlane v4, 2
97-
;; @0030 v14 = call fn0(v0, v13)
98-
;; @0030 v15 = insertlane v12, v14, 2
99-
;; @0030 v16 = extractlane v4, 3
100-
;; @0030 v17 = call fn0(v0, v16)
101-
;; @0030 v18 = insertlane v15, v17, 3
102-
;; @0032 v19 = bitcast.i8x16 little v18
103-
;; @0032 jump block1
89+
;; @0031 v4 = bitcast.f32x4 little v2
90+
;; @0031 v6 = vconst.f32x4 const0
91+
;; @0031 v7 = extractlane v4, 0
92+
;; @0031 v8 = call fn0(v0, v7)
93+
;; @0031 v9 = insertlane v6, v8, 0 ; v6 = const0
94+
;; @0031 v10 = extractlane v4, 1
95+
;; @0031 v11 = call fn0(v0, v10)
96+
;; @0031 v12 = insertlane v9, v11, 1
97+
;; @0031 v13 = extractlane v4, 2
98+
;; @0031 v14 = call fn0(v0, v13)
99+
;; @0031 v15 = insertlane v12, v14, 2
100+
;; @0031 v16 = extractlane v4, 3
101+
;; @0031 v17 = call fn0(v0, v16)
102+
;; @0031 v18 = insertlane v15, v17, 3
103+
;; @0033 v19 = bitcast.i8x16 little v18
104+
;; @0033 jump block1
104105
;;
105106
;; block1:
106-
;; @0032 return v19
107+
;; @0033 return v19
107108
;; }
108109
;;
109110
;; function u0:3(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -117,25 +118,25 @@
117118
;; stack_limit = gv2
118119
;;
119120
;; block0(v0: i64, v1: i64, v2: i8x16):
120-
;; @0037 v4 = bitcast.f32x4 little v2
121-
;; @0037 v6 = vconst.f32x4 const0
122-
;; @0037 v7 = extractlane v4, 0
123-
;; @0037 v8 = call fn0(v0, v7)
124-
;; @0037 v9 = insertlane v6, v8, 0 ; v6 = const0
125-
;; @0037 v10 = extractlane v4, 1
126-
;; @0037 v11 = call fn0(v0, v10)
127-
;; @0037 v12 = insertlane v9, v11, 1
128-
;; @0037 v13 = extractlane v4, 2
129-
;; @0037 v14 = call fn0(v0, v13)
130-
;; @0037 v15 = insertlane v12, v14, 2
131-
;; @0037 v16 = extractlane v4, 3
132-
;; @0037 v17 = call fn0(v0, v16)
133-
;; @0037 v18 = insertlane v15, v17, 3
134-
;; @0039 v19 = bitcast.i8x16 little v18
135-
;; @0039 jump block1
121+
;; @0038 v4 = bitcast.f32x4 little v2
122+
;; @0038 v6 = vconst.f32x4 const0
123+
;; @0038 v7 = extractlane v4, 0
124+
;; @0038 v8 = call fn0(v0, v7)
125+
;; @0038 v9 = insertlane v6, v8, 0 ; v6 = const0
126+
;; @0038 v10 = extractlane v4, 1
127+
;; @0038 v11 = call fn0(v0, v10)
128+
;; @0038 v12 = insertlane v9, v11, 1
129+
;; @0038 v13 = extractlane v4, 2
130+
;; @0038 v14 = call fn0(v0, v13)
131+
;; @0038 v15 = insertlane v12, v14, 2
132+
;; @0038 v16 = extractlane v4, 3
133+
;; @0038 v17 = call fn0(v0, v16)
134+
;; @0038 v18 = insertlane v15, v17, 3
135+
;; @003a v19 = bitcast.i8x16 little v18
136+
;; @003a jump block1
136137
;;
137138
;; block1:
138-
;; @0039 return v19
139+
;; @003a return v19
139140
;; }
140141
;;
141142
;; function u0:4(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -149,19 +150,19 @@
149150
;; stack_limit = gv2
150151
;;
151152
;; block0(v0: i64, v1: i64, v2: i8x16):
152-
;; @003e v4 = bitcast.f64x2 little v2
153-
;; @003e v6 = vconst.f64x2 const0
154-
;; @003e v7 = extractlane v4, 0
155-
;; @003e v8 = call fn0(v0, v7)
156-
;; @003e v9 = insertlane v6, v8, 0 ; v6 = const0
157-
;; @003e v10 = extractlane v4, 1
158-
;; @003e v11 = call fn0(v0, v10)
159-
;; @003e v12 = insertlane v9, v11, 1
160-
;; @0040 v13 = bitcast.i8x16 little v12
161-
;; @0040 jump block1
153+
;; @003f v4 = bitcast.f64x2 little v2
154+
;; @003f v6 = vconst.f64x2 const0
155+
;; @003f v7 = extractlane v4, 0
156+
;; @003f v8 = call fn0(v0, v7)
157+
;; @003f v9 = insertlane v6, v8, 0 ; v6 = const0
158+
;; @003f v10 = extractlane v4, 1
159+
;; @003f v11 = call fn0(v0, v10)
160+
;; @003f v12 = insertlane v9, v11, 1
161+
;; @0041 v13 = bitcast.i8x16 little v12
162+
;; @0041 jump block1
162163
;;
163164
;; block1:
164-
;; @0040 return v13
165+
;; @0041 return v13
165166
;; }
166167
;;
167168
;; function u0:5(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -175,19 +176,19 @@
175176
;; stack_limit = gv2
176177
;;
177178
;; block0(v0: i64, v1: i64, v2: i8x16):
178-
;; @0045 v4 = bitcast.f64x2 little v2
179-
;; @0045 v6 = vconst.f64x2 const0
180-
;; @0045 v7 = extractlane v4, 0
181-
;; @0045 v8 = call fn0(v0, v7)
182-
;; @0045 v9 = insertlane v6, v8, 0 ; v6 = const0
183-
;; @0045 v10 = extractlane v4, 1
184-
;; @0045 v11 = call fn0(v0, v10)
185-
;; @0045 v12 = insertlane v9, v11, 1
186-
;; @0047 v13 = bitcast.i8x16 little v12
187-
;; @0047 jump block1
179+
;; @0046 v4 = bitcast.f64x2 little v2
180+
;; @0046 v6 = vconst.f64x2 const0
181+
;; @0046 v7 = extractlane v4, 0
182+
;; @0046 v8 = call fn0(v0, v7)
183+
;; @0046 v9 = insertlane v6, v8, 0 ; v6 = const0
184+
;; @0046 v10 = extractlane v4, 1
185+
;; @0046 v11 = call fn0(v0, v10)
186+
;; @0046 v12 = insertlane v9, v11, 1
187+
;; @0048 v13 = bitcast.i8x16 little v12
188+
;; @0048 jump block1
188189
;;
189190
;; block1:
190-
;; @0047 return v13
191+
;; @0048 return v13
191192
;; }
192193
;;
193194
;; function u0:6(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -201,19 +202,19 @@
201202
;; stack_limit = gv2
202203
;;
203204
;; block0(v0: i64, v1: i64, v2: i8x16):
204-
;; @004c v4 = bitcast.f64x2 little v2
205-
;; @004c v6 = vconst.f64x2 const0
206-
;; @004c v7 = extractlane v4, 0
207-
;; @004c v8 = call fn0(v0, v7)
208-
;; @004c v9 = insertlane v6, v8, 0 ; v6 = const0
209-
;; @004c v10 = extractlane v4, 1
210-
;; @004c v11 = call fn0(v0, v10)
211-
;; @004c v12 = insertlane v9, v11, 1
212-
;; @004e v13 = bitcast.i8x16 little v12
213-
;; @004e jump block1
205+
;; @004d v4 = bitcast.f64x2 little v2
206+
;; @004d v6 = vconst.f64x2 const0
207+
;; @004d v7 = extractlane v4, 0
208+
;; @004d v8 = call fn0(v0, v7)
209+
;; @004d v9 = insertlane v6, v8, 0 ; v6 = const0
210+
;; @004d v10 = extractlane v4, 1
211+
;; @004d v11 = call fn0(v0, v10)
212+
;; @004d v12 = insertlane v9, v11, 1
213+
;; @004f v13 = bitcast.i8x16 little v12
214+
;; @004f jump block1
214215
;;
215216
;; block1:
216-
;; @004e return v13
217+
;; @004f return v13
217218
;; }
218219
;;
219220
;; function u0:7(i64 vmctx, i64, i8x16) -> i8x16 tail {
@@ -227,17 +228,40 @@
227228
;; stack_limit = gv2
228229
;;
229230
;; block0(v0: i64, v1: i64, v2: i8x16):
230-
;; @0053 v4 = bitcast.f64x2 little v2
231-
;; @0053 v6 = vconst.f64x2 const0
232-
;; @0053 v7 = extractlane v4, 0
233-
;; @0053 v8 = call fn0(v0, v7)
234-
;; @0053 v9 = insertlane v6, v8, 0 ; v6 = const0
235-
;; @0053 v10 = extractlane v4, 1
236-
;; @0053 v11 = call fn0(v0, v10)
237-
;; @0053 v12 = insertlane v9, v11, 1
238-
;; @0056 v13 = bitcast.i8x16 little v12
239-
;; @0056 jump block1
231+
;; @0054 v4 = bitcast.f64x2 little v2
232+
;; @0054 v6 = vconst.f64x2 const0
233+
;; @0054 v7 = extractlane v4, 0
234+
;; @0054 v8 = call fn0(v0, v7)
235+
;; @0054 v9 = insertlane v6, v8, 0 ; v6 = const0
236+
;; @0054 v10 = extractlane v4, 1
237+
;; @0054 v11 = call fn0(v0, v10)
238+
;; @0054 v12 = insertlane v9, v11, 1
239+
;; @0057 v13 = bitcast.i8x16 little v12
240+
;; @0057 jump block1
240241
;;
241242
;; block1:
242-
;; @0056 return v13
243+
;; @0057 return v13
244+
;; }
245+
;;
246+
;; function u0:8(i64 vmctx, i64, i8x16) -> i8x16 tail {
247+
;; gv0 = vmctx
248+
;; gv1 = load.i64 notrap aligned readonly gv0+8
249+
;; gv2 = load.i64 notrap aligned gv1+16
250+
;; const0 = 0x00000000000000000000000000000000
251+
;; stack_limit = gv2
252+
;;
253+
;; block0(v0: i64, v1: i64, v2: i8x16):
254+
;; @005c v4 = bitcast.f64x2 little v2
255+
;; @005c v5 = extractlane v4, 0
256+
;; @005c v6 = extractlane v4, 1
257+
;; @005c v7 = fcvt_to_uint_sat.i32 v5
258+
;; @005c v8 = fcvt_to_uint_sat.i32 v6
259+
;; @005c v9 = vconst.i32x4 const0
260+
;; @005c v10 = insertlane v9, v7, 0 ; v9 = const0
261+
;; @005c v11 = insertlane v10, v8, 1
262+
;; @005f v12 = bitcast.i8x16 little v11
263+
;; @005f jump block1
264+
;;
265+
;; block1:
266+
;; @005f return v12
243267
;; }

0 commit comments

Comments
 (0)