Skip to content

Commit 99ab363

Browse files
Use -0.0 in intrinsics::simd::reduce_add_unordered
-0.0, not +0.0, is the true additive identity for floats, and this matters to codegen.
1 parent 0307e40 commit 99ab363

File tree

2 files changed

+40
-2
lines changed

2 files changed

+40
-2
lines changed

compiler/rustc_codegen_llvm/src/intrinsic.rs

+2-2
Original file line number | Diff line number | Diff line change
@@ -2090,14 +2090,14 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
20902090
};
20912091
}
20922092

2093-
arith_red!(simd_reduce_add_ordered: vector_reduce_add, vector_reduce_fadd, true, add, 0.0);
2093+
arith_red!(simd_reduce_add_ordered: vector_reduce_add, vector_reduce_fadd, true, add, -0.0);
20942094
arith_red!(simd_reduce_mul_ordered: vector_reduce_mul, vector_reduce_fmul, true, mul, 1.0);
20952095
arith_red!(
20962096
simd_reduce_add_unordered: vector_reduce_add,
20972097
vector_reduce_fadd_reassoc,
20982098
false,
20992099
add,
2100-
0.0
2100+
-0.0
21012101
);
21022102
arith_red!(
21032103
simd_reduce_mul_unordered: vector_reduce_mul,
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,38 @@
1+
//@ revisions: x86_64 x86_64-avx2 aarch64
2+
//@ assembly-output: emit-asm
3+
//@ compile-flags: --crate-type=lib -O
4+
//@[aarch64] only-aarch64
5+
//@[x86_64] only-x86_64
6+
//@[x86_64-avx2] only-x86_64
7+
//@[x86_64-avx2] compile-flags: -Ctarget-cpu=x86-64-v3
8+
#![feature(portable_simd)]
9+
#![feature(core_intrinsics)]
10+
use std::intrinsics::simd as intrinsics;
11+
use std::simd::*;
12+
13+
// Regression test for https://github.com/rust-lang/rust/issues/130028
14+
// This intrinsic produces much worse code if you use +0.0 instead of -0.0 because
15+
// +0.0 isn't as easy to algebraically reassociate, even using LLVM's reassoc attribute!
16+
// With +0.0 it would emit roughly one extra fadd, depending on the architecture.
17+
18+
19+
// CHECK-LABEL: reduce_fadd_negative_zero
20+
/// Reduces the four `f32` lanes of `v` to a single `f32` by calling
/// `simd_reduce_add_unordered`.
///
/// The assembly directives inside the body pin, per revision, the instruction
/// sequence the reduction is expected to compile to, reject an `xorps` /
/// `vxorps` on the x86_64 revisions, and reject any further add-family
/// instruction before the return.
///
/// NOTE(review): declared `unsafe`, presumably because the intrinsic it wraps
/// is unsafe to call; no caller-side precondition is visible here -- confirm.
pub unsafe fn reduce_fadd_negative_zero(v: f32x4) -> f32 {
21+
// x86_64: addps
22+
// x86_64: movaps
23+
// x86_64: shufps
24+
// x86_64: addss
25+
// x86_64-NOT: xorps
26+
27+
// x86_64-avx2: vaddps
28+
// x86_64-avx2-NEXT: vmovshdup
29+
// x86_64-avx2-NEXT: vaddss
30+
// x86_64-avx2-NOT: vxorps
31+
32+
// aarch64: faddp
33+
// aarch64-NEXT: faddp
34+
35+
// Shared by every revision. After the sequence matched above, no other
// add-family instruction may show up ahead of the return.
// CHECK-NOT: {{f?}}add{{p?s*}}
36+
// CHECK: ret
37+
intrinsics::simd_reduce_add_unordered(v)
38+
}

0 commit comments

Comments
 (0)