Missed optimization with autovectorized saturating truncation (clamp) #104875

Open
okaneco opened this issue Aug 19, 2024 · 9 comments · May be fixed by #107741

Comments

@okaneco

okaneco commented Aug 19, 2024

In Rust, clamping and truncating a slice of i32 into a slice of u8 with the standard library's clamp function produces more instructions than manually clamping with max and min.

pub fn clamp(input: &[i32], output: &mut [u8]) {
    for (&i, o) in input.iter().zip(output.iter_mut()) {
        *o = i.clamp(0, 255) as u8;
    }
}
pub fn manual_clamp(input: &[i32], output: &mut [u8]) {
    for (&i, o) in input.iter().zip(output.iter_mut()) {
        *o = i.max(0).min(255) as u8;
    }
}

https://rust.godbolt.org/z/zf73jsqjq

Assembly instructions

Clamp

.LBB0_4:
        movdqu  xmm6, xmmword ptr [rdi + 4*r8]
        movdqu  xmm5, xmmword ptr [rdi + 4*r8 + 16]
        pxor    xmm3, xmm3
        pcmpgtd xmm3, xmm6
        packssdw        xmm3, xmm3
        packsswb        xmm3, xmm3
        pxor    xmm4, xmm4
        pcmpgtd xmm4, xmm5
        packssdw        xmm4, xmm4
        packsswb        xmm4, xmm4
        movdqa  xmm7, xmm6
        pxor    xmm7, xmm0
        movdqa  xmm8, xmm1
        pcmpgtd xmm8, xmm7
        pand    xmm6, xmm8
        pandn   xmm8, xmm2
        por     xmm8, xmm6
        packuswb        xmm8, xmm8
        packuswb        xmm8, xmm8
        pandn   xmm3, xmm8
        movdqa  xmm6, xmm5
        pxor    xmm6, xmm0
        movdqa  xmm7, xmm1
        pcmpgtd xmm7, xmm6
        pand    xmm5, xmm7
        pandn   xmm7, xmm2
        por     xmm7, xmm5
        packuswb        xmm7, xmm7
        packuswb        xmm7, xmm7
        pandn   xmm4, xmm7
        movd    dword ptr [rdx + r8], xmm3
        movd    dword ptr [rdx + r8 + 4], xmm4
        add     r8, 8
        cmp     rsi, r8
        jne     .LBB0_4

Manual clamp

.LBB0_4:
        movdqu  xmm0, xmmword ptr [rdi + 4*r8]
        packssdw        xmm0, xmm0
        packuswb        xmm0, xmm0
        movdqu  xmm1, xmmword ptr [rdi + 4*r8 + 16]
        packssdw        xmm1, xmm1
        packuswb        xmm1, xmm1
        movd    dword ptr [rdx + r8], xmm0
        movd    dword ptr [rdx + r8 + 4], xmm1
        add     r8, 8
        cmp     rsi, r8
        jne     .LBB0_4

Emitted IR - https://alive2.llvm.org/ce/z/hbU88w

Clamp

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %input.0, i64 %index
  %1 = getelementptr inbounds i8, ptr %output.0, i64 %index
  %2 = getelementptr inbounds i8, ptr %0, i64 16
  %wide.load = load <4 x i32>, ptr %0, align 4
  %wide.load6 = load <4 x i32>, ptr %2, align 4
  %3 = icmp slt <4 x i32> %wide.load, zeroinitializer
  %4 = icmp slt <4 x i32> %wide.load6, zeroinitializer
  %5 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
  %6 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %wide.load6, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
  %7 = trunc nuw <4 x i32> %5 to <4 x i8>
  %8 = trunc nuw <4 x i32> %6 to <4 x i8>
  %9 = select <4 x i1> %3, <4 x i8> zeroinitializer, <4 x i8> %7
  %10 = select <4 x i1> %4, <4 x i8> zeroinitializer, <4 x i8> %8
  %11 = getelementptr inbounds i8, ptr %1, i64 4
  store <4 x i8> %9, ptr %1, align 1
  store <4 x i8> %10, ptr %11, align 1
  %index.next = add nuw i64 %index, 8
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body, !llvm.loop !3

Manual clamp

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %input.0, i64 %index
  %1 = getelementptr inbounds i8, ptr %output.0, i64 %index
  %2 = getelementptr inbounds i8, ptr %0, i64 16
  %wide.load = load <4 x i32>, ptr %0, align 4
  %wide.load7 = load <4 x i32>, ptr %2, align 4
  %3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load, <4 x i32> zeroinitializer)
  %4 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %wide.load7, <4 x i32> zeroinitializer)
  %5 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %3, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
  %6 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %4, <4 x i32> <i32 255, i32 255, i32 255, i32 255>)
  %7 = trunc nuw <4 x i32> %5 to <4 x i8>
  %8 = trunc nuw <4 x i32> %6 to <4 x i8>
  %9 = getelementptr inbounds i8, ptr %1, i64 4
  store <4 x i8> %7, ptr %1, align 1
  store <4 x i8> %8, ptr %9, align 1
  %index.next = add nuw i64 %index, 8
  %10 = icmp eq i64 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body, !llvm.loop !8

The standard library clamp is implemented as follows.

fn clamp(n: i32, min: i32, max: i32) -> i32 {
    assert!(min <= max);
    if n < min {
        min
    } else if n > max {
        max
    } else {
        n
    }
}

It seems OK to transform the standard library clamp into the manual clamp; the minimized IR is below.
alive2 proof - https://alive2.llvm.org/ce/z/pRRVhU
Rust source - https://rust.godbolt.org/z/3PxW1xWqo

define i8 @src(i32 %input) {
  %1 = icmp slt i32 %input, 0
  %2 = tail call i32 @llvm.umin.i32(i32 %input, i32 255)
  %3 = trunc nuw i32 %2 to i8
  %4 = select i1 %1, i8 0, i8 %3
  ret i8 %4
}

define i8 @tgt(i32 %input) {
  %1 = tail call i32 @llvm.smax.i32(i32 %input, i32 0)
  %2 = tail call i32 @llvm.umin.i32(i32 %1, i32 255)
  %3 = trunc nuw i32 %2 to i8
  ret i8 %3
}

Real-world examples from functions in the image-webp crate:
https://rust.godbolt.org/z/veGzv1dPx - source
https://rust.godbolt.org/z/sf4v6ceGM - source

Originally reported in rust-lang/rust#125738.

@braw-lee
Contributor

Hi, I would like to take this.
This will be my first LLVM issue, and I'm not sure if it is beginner friendly; please assign it to me if it is not too advanced.

@dtcxzyw
Member

dtcxzyw commented Aug 22, 2024

@braw-lee Please read https://llvm.org/docs/InstCombineContributorGuide.html before submitting a patch. Good luck!

@braw-lee
Contributor

This https://alive2.llvm.org/ce/z/pRRVhU proof is specific to the above case, i.e. a range between 0 and 255 and a conversion from i32 to i8.
I tried to generalize it, but poison values invalidate it:
https://alive2.llvm.org/ce/z/RSyowR

I don't think we should be performing instruction combining for hard-coded values.
Can anyone help on how to proceed?

@dtcxzyw
Member

dtcxzyw commented Aug 23, 2024

> This https://alive2.llvm.org/ce/z/pRRVhU proof is specific to the above case, i.e. a range between 0 and 255 and a conversion from i32 to i8. I tried to generalize it, but poison values invalidate it: https://alive2.llvm.org/ce/z/RSyowR
>
> I don't think we should be performing instruction combining for hard-coded values. Can anyone help on how to proceed?

%min should be less than %max. Alive2: https://alive2.llvm.org/ce/z/aF_hf8
You can use simplifyICmpInst or isImpliedByDomCondition to check this pre-condition. But I think it would be better to start with constant %min/%max :)
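For illustration, a minimal sketch of the variable-bound fold being described (the names here are chosen just for the example, not taken from the linked Alive2 page), with the %min < %max precondition encoded via llvm.assume:

declare void @llvm.assume(i1)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)

define i32 @src(i32 %x, i32 %min, i32 %max) {
  ; precondition from the comment above: %min < %max
  %pre = icmp slt i32 %min, %max
  call void @llvm.assume(i1 %pre)
  %cmp = icmp slt i32 %x, %min
  %hi  = call i32 @llvm.smin.i32(i32 %x, i32 %max)
  %r   = select i1 %cmp, i32 %min, i32 %hi
  ret i32 %r
}

define i32 @tgt(i32 %x, i32 %min, i32 %max) {
  %pre = icmp slt i32 %min, %max
  call void @llvm.assume(i1 %pre)
  %lo = call i32 @llvm.smax.i32(i32 %x, i32 %min)
  %r  = call i32 @llvm.smin.i32(i32 %lo, i32 %max)
  ret i32 %r
}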

@braw-lee
Contributor

We already have this optimization for constant values:
https://rust.godbolt.org/z/v1GY4xjhc

The thing to notice there is that we are using slt and smin, but Rust seems to be generating slt and umin, and opt is not matching that.

So do we need to fix LLVM or Rust codegen to get this optimization for constant values? @dtcxzyw

We also need to add this optimization for the case where the clamp bounds are variables and (min < max). This pattern is optimized for constant values in canonicalizeSPF(); I am trying to figure out how to make it work for variables. A sketch of the already-handled shape is shown below.
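For reference, roughly the shape the existing constant-bound fold handles (a sketch approximating the Godbolt example above, not copied from it), using a signed smin on the false arm:

declare i32 @llvm.smin.i32(i32, i32)

define i32 @already_handled(i32 %x) {
  ; slt + smin: presumably canonicalized by canonicalizeSPF() into
  ; smax(smin(%x, 255), 0) or an equivalent min/max intrinsic form
  %c = icmp slt i32 %x, 0
  %m = call i32 @llvm.smin.i32(i32 %x, i32 255)
  %r = select i1 %c, i32 0, i32 %m
  ret i32 %r
}

The IR emitted for the Rust code in this issue instead has umin on the false arm (see the minimized src above), which is why it is not matched.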

@nikic
Contributor

nikic commented Aug 29, 2024

The umin is produced by LLVM, not Rust, because both arguments are known non-negative.

I believe the pattern you want to handle is https://alive2.llvm.org/ce/z/qc9GCG (with constant min/max). And then as a next step the variation where there is an additional trunc in between.
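A guess at the shape of that pattern (the linked page is not reproduced here): the same select + umin form as the minimized IR above, but without the trunc, so the result stays i32:

declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)

define i32 @src(i32 %x) {
  %c = icmp slt i32 %x, 0
  %u = call i32 @llvm.umin.i32(i32 %x, i32 255)
  %r = select i1 %c, i32 0, i32 %u
  ret i32 %r
}

define i32 @tgt(i32 %x) {
  ; smax clamps the negative case to 0, then umin clamps the top to 255
  %lo = call i32 @llvm.smax.i32(i32 %x, i32 0)
  %r  = call i32 @llvm.umin.i32(i32 %lo, i32 255)
  ret i32 %r
}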

@braw-lee
Contributor

braw-lee commented Sep 7, 2024

@nikic

define i8 @src(i32 %input) {
  %1 = icmp slt i32 %input, 0
  %2 = tail call i32 @llvm.umin.i32(i32 %input, i32 255)
  %3 = trunc nuw i32 %2 to i8
  %4 = select i1 %1, i8 0, i8 %3
  ret i8 %4
}

I don't get how both arguments are known non-negative here. What if %input is, say, -5?
How does umin(-5, 255) work?

@nikic
Contributor

nikic commented Sep 7, 2024

@braw-lee The non-negative inputs here are 0 (in the icmp and select) and 255, not %input.

@braw-lee
Contributor

braw-lee commented Sep 8, 2024

@nikic
Related PR here: #107741

Currently, for a pattern like this

select( (i <s min) ? min : smin(i, max)) -> smax(min, smin(i, max))

we take the smax of the LHS and RHS.

I tried to reuse this code to handle the case where we have a signed comparison and an unsigned min/max, so we again take the smax of the LHS and RHS (here the RHS is unsigned, which creates a problem):

select( (i <s min) ? min : umin(i, max)) -> smax(min, umin(i, max)) ; (incorrect transformation)

which further gets optimized to

smax(min, umin(i, max)) -> umin(i, max)

but this is incorrect according to alive-tv.

The correct transformations would be either

1. select( (i <s min) ? min : umin(i, max)) -> smax(min, smin(i, max))

or

2. select( (i <s min) ? min : umin(i, max)) -> umin(max, smax(i, min))

However, it is difficult to generate the correct transformations using the current code (which creates a min/max intrinsic around the existing LHS and RHS). Should I create a separate match for this pattern, or is it better to change the RHS in transformation 1 from umin(i, max) to smin(i, max)?
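To make the failure mode concrete, a small sketch of the incorrect rewrite for the 0..255 case from this issue (hypothetical function name):

declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)

define i32 @wrong(i32 %i) {
  ; smax(min, umin(i, max)) with min = 0, max = 255.
  ; For %i = -1, umin sees 0xFFFFFFFF, so %u = 255 and the result is 255,
  ; while the original select( (%i <s 0) ? 0 : umin(%i, 255) ) yields 0.
  %u = call i32 @llvm.umin.i32(i32 %i, i32 255)
  %r = call i32 @llvm.smax.i32(i32 %u, i32 0)
  ret i32 %r
}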

braw-lee linked a pull request Sep 12, 2024 that will close this issue