Basic vectorization performance regression from 1.48.0 onwards

Hi all. The compiler output for the test cases below is efficiently handled in 1.47. However, it has since been progressively unraveling into something quite awkward. This trend occurs on both x64 and ARM targets. All examples are compiled with `-C opt-level=3  -C lto=fat -C codegen-units=1`.

### Code

```rust
pub fn case_1(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    [
        a[0] + b[0],
        a[1] + b[1],
        a[2] + b[2],
        a[3] + b[3],
    ]
}

pub fn case_2(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let mut c = [0.0; 4];
    for i in 0..4 {
        c[i] = a[i] + b[i];
    }
    c
}
```

### 1.47.0 - Ok

https://rust.godbolt.org/z/33WhfM3n8

```asm
example::case_1:
        mov     rax, rdi
        vmovups xmm0, xmmword ptr [rsi]
        vaddps  xmm0, xmm0, xmmword ptr [rdx]
        vmovups xmmword ptr [rdi], xmm0
        ret

example::case_2:
        mov     rax, rdi
        vmovups xmm0, xmmword ptr [rsi]
        vaddps  xmm0, xmm0, xmmword ptr [rdx]
        vmovups xmmword ptr [rdi], xmm0
        ret
```

### 1.48.0 - Regression case_1

https://rust.godbolt.org/z/bqEre5dx5

```asm
example::case_1:
        vmovss  xmm0, dword ptr [rdi]
        vaddss  xmm0, xmm0, dword ptr [rsi]
        vmovss  xmm1, dword ptr [rdi + 4]
        vaddss  xmm1, xmm1, dword ptr [rsi + 4]
        vmovsd  xmm2, qword ptr [rdi + 8]
        vmovsd  xmm3, qword ptr [rsi + 8]
        vaddps  xmm2, xmm2, xmm3
        vmovd   eax, xmm0
        vmovd   ecx, xmm1
        vextractps      esi, xmm2, 0
        vextractps      edx, xmm2, 1
        shl     rdx, 32
        or      rdx, rsi
        shl     rcx, 32
        or      rax, rcx
        ret

example::case_2:
        vmovups xmm0, xmmword ptr [rdi]
        vaddps  xmm0, xmm0, xmmword ptr [rsi]
        vmovq   rax, xmm0
        vpextrq rdx, xmm0, 1
        ret
```

### 1.50.0 - Regression case_1 and case_2

https://rust.godbolt.org/z/Pj399ezPq

```asm
example::case_1:
        vmovd   xmm0, esi
        shr     rsi, 32
        vmovd   xmm1, edi
        shr     rdi, 32
        vpinsrd xmm1, xmm1, edi, 1
        vmovd   xmm2, esi
        vmovd   xmm3, edx
        shr     rdx, 32
        vpinsrd xmm3, xmm3, edx, 1
        vaddps  xmm1, xmm1, xmm3
        vmovd   xmm3, ecx
        shr     rcx, 32
        vaddss  xmm0, xmm0, xmm3
        vmovd   xmm3, ecx
        vaddss  xmm2, xmm2, xmm3
        vmovd   edx, xmm0
        vmovd   eax, xmm2
        shl     rax, 32
        or      rdx, rax
        vxorps  xmm0, xmm0, xmm0
        vblendps        xmm0, xmm1, xmm0, 2
        vmovq   rcx, xmm0
        vmovshdup       xmm0, xmm1
        vmovq   rax, xmm0
        shl     rax, 32
        or      rax, rcx
        ret

example::case_2:
        vmovq   xmm0, rcx
        vmovq   xmm1, rdx
        vpunpcklqdq     xmm0, xmm1, xmm0
        vmovq   xmm1, rsi
        vmovq   xmm2, rdi
        vpunpcklqdq     xmm1, xmm2, xmm1
        vaddps  xmm0, xmm1, xmm0
        vmovq   rax, xmm0
        vpextrq rdx, xmm0, 1
        ret
```

### 1.52.0 - Further regression case_2

https://rust.godbolt.org/z/KsvbW5vhW

```asm
example::case_1:
        vmovd   xmm0, esi
        shr     rsi, 32
        vmovd   xmm1, edi
        shr     rdi, 32
        vpinsrd xmm1, xmm1, edi, 1
        vmovd   xmm2, esi
        vmovd   xmm3, edx
        shr     rdx, 32
        vpinsrd xmm3, xmm3, edx, 1
        vaddps  xmm1, xmm1, xmm3
        vmovd   xmm3, ecx
        shr     rcx, 32
        vaddss  xmm0, xmm0, xmm3
        vmovd   xmm3, ecx
        vaddss  xmm2, xmm2, xmm3
        vmovd   edx, xmm0
        vmovd   eax, xmm2
        shl     rax, 32
        or      rdx, rax
        vxorps  xmm0, xmm0, xmm0
        vblendps        xmm0, xmm0, xmm1, 1
        vmovq   rcx, xmm0
        vmovshdup       xmm0, xmm1
        vmovq   rax, xmm0
        shl     rax, 32
        or      rax, rcx
        ret

example::case_2:
        mov     rax, rsi
        shld    rax, rdi, 32
        vmovd   xmm0, esi
        shr     rsi, 32
        vmovq   xmm1, rax
        vmovq   xmm2, rsi
        vpunpckldq      xmm1, xmm2, xmm1
        vmovd   xmm2, ecx
        vaddss  xmm0, xmm0, xmm2
        vmovd   xmm2, edx
        shrd    rdx, rcx, 32
        shr     rcx, 32
        vmovq   xmm3, rdx
        vmovq   xmm4, rcx
        vpunpckldq      xmm3, xmm4, xmm3
        vmovd   xmm4, edi
        vaddps  xmm1, xmm1, xmm3
        vaddps  xmm2, xmm2, xmm4
        vextractps      eax, xmm2, 0
        vmovd   ecx, xmm0
        vextractps      edx, xmm1, 0
        vextractps      esi, xmm1, 1
        shl     rsi, 32
        shl     rdx, 32
        or      rdx, rcx
        or      rax, rsi
        ret
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Basic vectorization performance regression from 1.48.0 onwards #85265

Code

1.47.0 - Ok

1.48.0 - Regression case_1

1.50.0 - Regression case_1 and case_2

1.52.0 - Further regression case_2

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Basic vectorization performance regression from 1.48.0 onwards #85265

Description

Code

1.47.0 - Ok

1.48.0 - Regression case_1

1.50.0 - Regression case_1 and case_2

1.52.0 - Further regression case_2

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions