a case where bounds checks should be eliminated #58602

oconnor663 · 2019-02-20T17:49:22Z

const INPUT: usize = 10;
const OUTPUT: usize = 3;

pub fn get_output1(input: &[u8; INPUT], offset: usize) -> Option<&[u8]> {
    if offset <= INPUT - OUTPUT {
        // This version optimizes out the extra bounds checks for the slice.
        Some(&input[offset..offset + OUTPUT])
    } else {
        None
    }
}

pub fn get_output2(input: &[u8; INPUT], offset: usize) -> Option<&[u8]> {
    if offset <= INPUT - OUTPUT {
        // This version fails to optimize bounds checks.
        Some(&input[offset..][..OUTPUT])
    } else {
        None
    }
}

This is a simplified version of a pessimization that came up when I was trying to optimize the array_ref! macro: droundy/arrayref#16

The text was updated successfully, but these errors were encountered:

luqmana · 2019-03-27T02:54:53Z

So this is what the LLVM IR looks like optimized:

; foo::get_output1
; Function Attrs: nonlazybind uwtable
define { i8*, i64 } @_ZN3foo11get_output117haa6e1bf361064f05E([10 x i8]* noalias readonly align 1 dereferenceable(10) %input, i64 %offset) unnamed_addr #0 {
start:
  %0 = icmp ult i64 %offset, 8
  %1 = getelementptr inbounds [10 x i8], [10 x i8]* %input, i64 0, i64 %offset
  %_0.sroa.0.0 = select i1 %0, i8* %1, i8* null
  %2 = insertvalue { i8*, i64 } undef, i8* %_0.sroa.0.0, 0
  %3 = insertvalue { i8*, i64 } %2, i64 3, 1
  ret { i8*, i64 } %3
}

; foo::get_output2
; Function Attrs: nonlazybind uwtable
define { i8*, i64 } @_ZN3foo11get_output217h09c70bf053786004E([10 x i8]* noalias readonly align 1 dereferenceable(10) %input, i64 %offset) unnamed_addr #0 {
start:
  %0 = icmp ult i64 %offset, 8
  br i1 %0, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit", label %bb5

"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit": ; preds = %start
  %1 = sub i64 10, %offset
  %2 = icmp ult i64 %1, 3
  br i1 %2, label %bb4.i.i.i, label %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit"

bb4.i.i.i:                                        ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit"
; call core::slice::slice_index_len_fail
  tail call void @_ZN4core5slice20slice_index_len_fail17h3b63b7b5b8377837E(i64 3, i64 %1)
  unreachable

"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit": ; preds = %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17h0c1ade7fc9d72442E.exit"
  %3 = getelementptr inbounds [10 x i8], [10 x i8]* %input, i64 0, i64 %offset
  br label %bb5

bb5:                                              ; preds = %start, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit"
  %_0.sroa.0.0 = phi i8* [ %3, %"_ZN4core5slice74_$LT$impl$u20$core..ops..index..Index$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$5index17hbbe883cdf661ef0bE.exit" ], [ null, %start ]
  %4 = insertvalue { i8*, i64 } undef, i8* %_0.sroa.0.0, 0
  %5 = insertvalue { i8*, i64 } %4, i64 3, 1
  ret { i8*, i64 } %5
}

Looking at it we can tell that %2 = icmp ult i64 %1, 3 should be implied by the previous %0 = icmp ult i64 %offset, 8 and in fact LLVM does try to do this opt (from a quick looking in multiple passes instcombine, simplifycfg, instsimplify). Unfortunately, it doesn't seem to be able to relate %1 and %offset.

Just manually rewriting that condition as %2 = icmp ugt i64 %offset, 7 causes it to optimize get_output2 to the same as get_output. But that's not exactly a valid general transformation.

nikic · 2019-03-27T08:51:18Z

This could be decomposed into three optimizations: First, it should be possible to determine that the sub is nuw. I think this is something that CVP should be capable of determining, at least in principle. Second, %y = sub nuw C, %x; icmp P %y, C2 should be instcombined to icmp swap(P) %y, C-C2. Finally CSE will see that the conditions are now the same (this part already works).

luqmana · 2019-03-28T05:54:48Z

https://reviews.llvm.org/D59916

luqmana · 2019-04-20T06:26:43Z

The (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C) part of the change landed already in https://reviews.llvm.org/D59916.
The change to teach CVP to add nuw/nsw to sub is at https://reviews.llvm.org/D60036 but unfortunately that optimization is currently disabled by default due to another LLVM bug: https://bugs.llvm.org/show_bug.cgi?id=31181

nikic · 2019-06-29T20:35:08Z

CVP nowrap inference reenabled in https://reviews.llvm.org/D62776, so this issue should be fixed once LLVM is updated.

mati865 · 2019-07-19T19:43:08Z

LLVM was upgraded and both functions generate equal assembly now.

luqmana added the WG-llvm Working group: LLVM backend code generation label Mar 27, 2019

nikic closed this as completed Jul 19, 2019

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

a case where bounds checks should be eliminated #58602

a case where bounds checks should be eliminated #58602

oconnor663 commented Feb 20, 2019

luqmana commented Mar 27, 2019

nikic commented Mar 27, 2019

luqmana commented Mar 28, 2019

luqmana commented Apr 20, 2019

nikic commented Jun 29, 2019

mati865 commented Jul 19, 2019

a case where bounds checks should be eliminated #58602

a case where bounds checks should be eliminated #58602

Comments

oconnor663 commented Feb 20, 2019

luqmana commented Mar 27, 2019

nikic commented Mar 27, 2019

luqmana commented Mar 28, 2019

luqmana commented Apr 20, 2019

nikic commented Jun 29, 2019

mati865 commented Jul 19, 2019