Skip to content

Commit 7b69d21

Browse files
committed
Don't aggregate homogeneous floats in the Rust ABI
1 parent 2e374cf commit 7b69d21

File tree

4 files changed

+143
-2
lines changed

4 files changed

+143
-2
lines changed

Diff for: compiler/rustc_middle/src/ty/layout.rs

+20-2
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
1313
use rustc_span::symbol::Symbol;
1414
use rustc_span::{Span, DUMMY_SP};
1515
use rustc_target::abi::call::{
16-
ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
16+
ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
17+
Reg, RegKind,
1718
};
1819
use rustc_target::abi::*;
1920
use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3203,10 +3204,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
32033204

32043205
if arg.layout.is_unsized() || size > max_by_val_size {
32053206
arg.make_indirect();
3207+
} else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
3208+
kind: RegKind::Float,
3209+
..
3210+
})) = arg.layout.homogeneous_aggregate(self)
3211+
{
3212+
// We don't want to aggregate floats as an aggregate of Integer
3213+
// because this will hurt the generated assembly (#93490)
3214+
//
3215+
// As an optimization we want to pass homogeneous aggregates of floats
3216+
// greater than pointer size as indirect
3217+
if size > Pointer.size(self) {
3218+
arg.make_indirect();
3219+
}
32063220
} else {
32073221
// We want to pass small aggregates as immediates, but using
32083222
// a LLVM aggregate type for this leads to bad optimizations,
32093223
// so we pick an appropriately sized integer type instead.
3224+
//
3225+
// NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
3226+
// we could do ([f32; 2], u64) which is better but this is the best we
3227+
// can do right now.
32103228
arg.cast_to(Reg { kind: RegKind::Integer, size });
32113229
}
32123230
}
@@ -3237,7 +3255,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
32373255
arg.make_indirect();
32383256
}
32393257

3240-
_ => {},
3258+
_ => {}
32413259
}
32423260
};
32433261
fixup(&mut fn_abi.ret);

Diff for: src/test/assembly/x86-64-homogenous-floats.rs

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
// assembly-output: emit-asm
2+
// needs-llvm-components: x86
3+
// compile-flags: --target x86_64-unknown-linux-gnu
4+
// compile-flags: -C llvm-args=--x86-asm-syntax=intel
5+
// compile-flags: -C opt-level=3
6+
7+
#![crate_type = "rlib"]
8+
#![no_std]
9+
10+
// CHECK-LABEL: sum_f32:
11+
// CHECK: addss xmm0, xmm1
12+
// CHECK-NEXT: ret
13+
#[no_mangle]
14+
pub fn sum_f32(a: f32, b: f32) -> f32 {
15+
a + b
16+
}
17+
18+
// CHECK-LABEL: sum_f32x2:
19+
// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
20+
// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
21+
// CHECK-NEXT: ret
22+
#[no_mangle]
23+
pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
24+
[
25+
a[0] + b[0],
26+
a[1] + b[1],
27+
]
28+
}
29+
30+
// CHECK-LABEL: sum_f32x4:
31+
// CHECK: mov rax, [[PTR_IN:.*]]
32+
// CHECK-NEXT: movups [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
33+
// CHECK-NEXT: movups [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
34+
// CHECK-NEXT: addps [[XMMB]], [[XMMA]]
35+
// CHECK-NEXT: movups xmmword ptr [[[PTR_IN]]], [[XMMB]]
36+
// CHECK-NEXT: ret
37+
#[no_mangle]
38+
pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
39+
[
40+
a[0] + b[0],
41+
a[1] + b[1],
42+
a[2] + b[2],
43+
a[3] + b[3],
44+
]
45+
}

Diff for: src/test/codegen/homogeneous-floats.rs

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
//! Check that small (less than 128 bits on x86_64) homogeneous floats are passed either as an
2+
//! array or by a pointer
3+
4+
// compile-flags: -C no-prepopulate-passes -O
5+
// only-x86_64
6+
7+
#![crate_type = "lib"]
8+
9+
pub struct Foo {
10+
bar1: f32,
11+
bar2: f32,
12+
bar3: f32,
13+
bar4: f32,
14+
}
15+
16+
// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
17+
#[no_mangle]
18+
pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
19+
todo!()
20+
}
21+
22+
// CHECK: define void @array_f32x4([4 x float]* {{.*}} sret([4 x float]) {{.*}} %0, [4 x float]* {{.*}} %a, [4 x float]* {{.*}} %b)
23+
#[no_mangle]
24+
pub fn array_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
25+
todo!()
26+
}
27+
28+
// CHECK: define void @array_f32x4_nested(%Foo* {{.*}} sret(%Foo) {{.*}} %0, %Foo* {{.*}} %a, %Foo* {{.*}} %b)
29+
#[no_mangle]
30+
pub fn array_f32x4_nested(a: Foo, b: Foo) -> Foo {
31+
todo!()
32+
}

Diff for: src/test/ui/abi/homogenous-floats.rs

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// This tests that, no matter the optimization level or the target features enabled, the
2+
// non-aggregation of homogeneous floats in the ABI is sound and still produces the right answer.
3+
4+
// revisions: opt-0 opt-0-native opt-1 opt-1-native opt-2 opt-2-native opt-3 opt-3-native
5+
// [opt-0]: compile-flags: -C opt-level=0
6+
// [opt-1]: compile-flags: -C opt-level=1
7+
// [opt-2]: compile-flags: -C opt-level=2
8+
// [opt-3]: compile-flags: -C opt-level=3
9+
// [opt-0-native]: compile-flags: -C target-cpu=native
10+
// [opt-1-native]: compile-flags: -C target-cpu=native
11+
// [opt-2-native]: compile-flags: -C target-cpu=native
12+
// [opt-3-native]: compile-flags: -C target-cpu=native
13+
// run-pass
14+
15+
#![feature(core_intrinsics)]
16+
17+
use std::intrinsics::black_box;
18+
19+
pub fn sum_f32(a: f32, b: f32) -> f32 {
20+
a + b
21+
}
22+
23+
pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
24+
[a[0] + b[0], a[1] + b[1]]
25+
}
26+
27+
pub fn sum_f32x3(a: [f32; 3], b: [f32; 3]) -> [f32; 3] {
28+
[a[0] + b[0], a[1] + b[1], a[2] + b[2]]
29+
}
30+
31+
pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
32+
[a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
33+
}
34+
35+
fn main() {
36+
assert_eq!(1., black_box(sum_f32(black_box(0.), black_box(1.))));
37+
assert_eq!([2., 2.], black_box(sum_f32x2(black_box([2., 0.]), black_box([0., 2.]))));
38+
assert_eq!(
39+
[3., 3., 3.],
40+
black_box(sum_f32x3(black_box([1., 2., 3.]), black_box([2., 1., 0.])))
41+
);
42+
assert_eq!(
43+
[4., 4., 4., 4.],
44+
black_box(sum_f32x4(black_box([1., 2., 3., 4.]), black_box([3., 2., 1., 0.])))
45+
);
46+
}

0 commit comments

Comments
 (0)