Skip to content

Commit b571f53

Browse files
committed
Auto merge of rust-lang#116479 - scottmcm:no-1-simd, r=<try>
Copy 1-/2-element arrays as scalars, not vectors

For `[T; 1]` it's silly to copy as `<1 x T>` when we can just copy as `T`. And treat `[T; 2]` as a scalar pair (like `(T, T)`) when copying it. Inspired by rust-lang#101210 (comment), which pointed out that `Option<[u8; 1]>` was codegenning worse than `Option<u8>`. (I'm not sure *why* LLVM doesn't optimize out `<1 x u8>`, but might as well just not emit it in the first place in this codepath.)
2 parents 1bc0463 + b5a9dd7 commit b571f53

File tree

5 files changed

+89
-5
lines changed

5 files changed

+89
-5
lines changed

compiler/rustc_codegen_llvm/src/type_of.rs

+13-5
Original file line numberDiff line numberDiff line change
@@ -383,9 +383,6 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
383383

384384
// Vectors, even for non-power-of-two sizes, have the same layout as
385385
// arrays but don't count as aggregate types
386-
// While LLVM theoretically supports non-power-of-two sizes, and they
387-
// often work fine, sometimes x86-isel deals with them horribly
388-
// (see #115212) so for now only use power-of-two ones.
389386
if let FieldsShape::Array { count, .. } = self.layout.fields()
390387
&& count.is_power_of_two()
391388
&& let element = self.field(cx, 0)
@@ -396,8 +393,19 @@ impl<'tcx> LayoutLlvmExt<'tcx> for TyAndLayout<'tcx> {
396393
// up suppressing vectorization as it introduces shifts when it
397394
// extracts all the individual values.
398395

399-
let ety = element.llvm_type(cx);
400-
return Some(cx.type_vector(ety, *count));
396+
if *count <= 2 {
397+
// For short arrays, use LLVM's array type, which its optimizations
398+
// will unpack into a scalar or pair of scalars.
399+
// (Having types like `<1 x u8>` is silly.)
400+
let ety = element.llvm_type(cx);
401+
return Some(cx.type_array(ety, *count));
402+
} else if count.is_power_of_two() {
403+
// While LLVM theoretically supports non-power-of-two sizes, and they
404+
// often work fine, sometimes x86-isel deals with them horribly
405+
// (see #115212) so for now only use power-of-two ones.
406+
let ety = element.llvm_type(cx);
407+
return Some(cx.type_vector(ety, *count));
408+
}
401409
}
402410

403411
// FIXME: The above only handled integer arrays; surely more things
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// assembly-output: emit-asm
2+
// compile-flags: --crate-type=lib -O -C llvm-args=-x86-asm-syntax=intel
3+
// only-x86_64
4+
// ignore-sgx
5+
6+
// This is emitted in a way that results in two loads and two stores in LLVM-IR.
7+
// Confirm that that doesn't mean 4 instructions in assembly.
8+
9+
// CHECK-LABEL: array_copy_2_elements:
10+
#[no_mangle]
11+
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
12+
// CHECK-NOT: byte
13+
// CHECK-NOT: mov
14+
// CHECK: mov{{.+}}, word ptr
15+
// CHECK-NEXT: mov word ptr
16+
// CHECK-NEXT: ret
17+
*p = *a;
18+
}

tests/codegen/array-clone.rs

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
// compile-flags: -O
2+
// min-llvm-version: 17 (earlier versions have trouble merging the loads)
23

34
#![crate_type = "lib"]
45

tests/codegen/array-codegen.rs

+22
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,25 @@ pub fn array_copy(a: &[u8; 4], p: &mut [u8; 4]) {
3232
// CHECK: store <4 x i8> %[[TEMP2]], ptr %p, align 1
3333
*p = *a;
3434
}
35+
36+
// CHECK-LABEL: @array_copy_1_element
37+
#[no_mangle]
38+
pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
39+
// CHECK: %[[LOCAL:.+]] = alloca [1 x i8], align 1
40+
// CHECK: %[[TEMP1:.+]] = load [1 x i8], ptr %a, align 1
41+
// CHECK: store [1 x i8] %[[TEMP1]], ptr %[[LOCAL]], align 1
42+
// CHECK: %[[TEMP2:.+]] = load [1 x i8], ptr %[[LOCAL]], align 1
43+
// CHECK: store [1 x i8] %[[TEMP2]], ptr %p, align 1
44+
*p = *a;
45+
}
46+
47+
// CHECK-LABEL: @array_copy_2_elements
48+
#[no_mangle]
49+
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
50+
// CHECK: %[[LOCAL:.+]] = alloca [2 x i8], align 1
51+
// CHECK: %[[TEMP1:.+]] = load [2 x i8], ptr %a, align 1
52+
// CHECK: store [2 x i8] %[[TEMP1]], ptr %[[LOCAL]], align 1
53+
// CHECK: %[[TEMP2:.+]] = load [2 x i8], ptr %[[LOCAL]], align 1
54+
// CHECK: store [2 x i8] %[[TEMP2]], ptr %p, align 1
55+
*p = *a;
56+
}

tests/codegen/array-optimized.rs

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// compile-flags: -O
2+
3+
#![crate_type = "lib"]
4+
5+
// CHECK-LABEL: @array_copy_1_element
6+
#[no_mangle]
7+
pub fn array_copy_1_element(a: &[u8; 1], p: &mut [u8; 1]) {
8+
// CHECK-NOT: alloca
9+
// CHECK: %[[TEMP:.+]] = load i8, ptr %a, align 1
10+
// CHECK: store i8 %[[TEMP]], ptr %p, align 1
11+
// CHECK: ret
12+
*p = *a;
13+
}
14+
15+
// CHECK-LABEL: @array_copy_2_elements
16+
#[no_mangle]
17+
pub fn array_copy_2_elements(a: &[u8; 2], p: &mut [u8; 2]) {
18+
// CHECK-NOT: alloca
19+
// CHECK: %[[TEMP1:.+]] = load i8, ptr %a, align 1
20+
// CHECK: %[[TEMP2:.+]] = load i8, ptr
21+
// CHECK: store i8 %[[TEMP1]], ptr %p, align 1
22+
// CHECK: store i8 %[[TEMP2]], ptr
23+
// CHECK: ret
24+
*p = *a;
25+
}
26+
27+
// CHECK-LABEL: @array_copy_4_elements
28+
#[no_mangle]
29+
pub fn array_copy_4_elements(a: &[u8; 4], p: &mut [u8; 4]) {
30+
// CHECK-NOT: alloca
31+
// CHECK: %[[TEMP:.+]] = load <4 x i8>, ptr %a, align 1
32+
// CHECK: store <4 x i8> %[[TEMP]], ptr %p, align 1
33+
// CHECK: ret
34+
*p = *a;
35+
}

0 commit comments

Comments
 (0)