From e04a04d95fd575cd9317c453aaa0c180419df47e Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Mon, 10 Mar 2025 06:55:10 -0400 Subject: [PATCH 1/8] [SeparateConstOffsetFromGEP] Preserve inbounds flag based on ValueTracking If we know that the initial GEP was inbounds, and we change it to a sequence of GEPs from the same base pointer where every offset is non-negative, then the new GEPs are inbounds. For SWDEV-516125. --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 18 +++++++++++---- .../AMDGPU/preserve-inbounds.ll | 23 +++++++++++++++++++ .../NVPTX/split-gep-and-gvn.ll | 16 ++++++------- .../NVPTX/split-gep.ll | 8 +++---- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index e048015298461..70940939be165 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1052,6 +1052,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { } } + bool MayRecoverInbounds = AccumulativeByteOffset >= 0 && GEP->isInBounds(); + // Remove the constant offset in each sequential index. The resultant GEP // computes the variadic base. // Notice that we don't remove struct field indices here. If LowerGEP is @@ -1079,6 +1081,8 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // and the old index if they are not used. RecursivelyDeleteTriviallyDeadInstructions(UserChainTail); RecursivelyDeleteTriviallyDeadInstructions(OldIdx); + MayRecoverInbounds = + MayRecoverInbounds && computeKnownBits(NewIdx, *DL).isNonNegative(); } } } @@ -1100,11 +1104,15 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // address with silently-wrapping two's complement arithmetic". // Therefore, the final code will be a semantically equivalent. // - // TODO(jingyue): do some range analysis to keep as many inbounds as - // possible. GEPs with inbounds are more friendly to alias analysis. - // TODO(gep_nowrap): Preserve nuw at least. - GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none(); - GEP->setNoWrapFlags(GEPNoWrapFlags::none()); + // If the initial GEP was inbounds and all variable indices and the + // accumulated offsets are non-negative, they can be added in any order and + // the intermediate results are in bounds. So, we can preserve the inbounds + // flag for both GEPs. GEPs with inbounds are more friendly to alias analysis. + // + // TODO(gep_nowrap): Preserve nuw? + GEPNoWrapFlags NewGEPFlags = + MayRecoverInbounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(); + GEP->setNoWrapFlags(NewGEPFlags); // Lowers a GEP to either GEPs with a single index or arithmetic operations. if (LowerGEP) { diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll index 422e5d8215502..01619aa481ddd 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll @@ -16,3 +16,26 @@ entry: %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx ret ptr %arrayidx } + +; All offsets must be positive, so inbounds can be preserved. 
+define void @must_be_inbounds(ptr %dst, ptr %src, i32 %i) { +; CHECK-LABEL: @must_be_inbounds( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[I_PROM:%.*]] = zext i32 [[I:%.*]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 [[I_PROM]] +; CHECK-NEXT: [[ARRAYIDX_SRC2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX_SRC2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[I_PROM]] +; CHECK-NEXT: [[ARRAYIDX_DST4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 4 +; CHECK-NEXT: store float [[TMP1]], ptr [[ARRAYIDX_DST4]], align 4 +; CHECK-NEXT: ret void +; +entry: + %i.prom = zext i32 %i to i64 + %idx = add nsw i64 %i.prom, 1 + %arrayidx.src = getelementptr inbounds float, ptr %src, i64 %idx + %3 = load float, ptr %arrayidx.src, align 4 + %arrayidx.dst = getelementptr inbounds float, ptr %dst, i64 %idx + store float %3, ptr %arrayidx.dst, align 4 + ret void +} diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll index 9a73feb2c4b5c..4474585bf9b06 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -157,19 +157,19 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: .preheader: ; IR-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64 ; IR-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64 -; IR-NEXT: [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]] +; IR-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]] ; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr ; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 4 +; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 4 ; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr ; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 ; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]] -; IR-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 128 +; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 128 ; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr ; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4 ; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]] -; IR-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 132 +; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 132 ; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr ; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]] @@ -224,19 +224,19 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: .preheader: ; IR-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64 ; IR-NEXT: [[TMP1:%.*]] = zext i32 [[X]] to i64 -; IR-NEXT: [[TMP2:%.*]] = getelementptr [32 x [32 x float]], ptr addrspace(3) @array, i64 0, i64 [[TMP1]], i64 [[TMP0]] +; IR-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x [32 x float]], ptr addrspace(3) @array, i64 0, 
i64 [[TMP1]], i64 [[TMP0]] ; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr ; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 4 +; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 4 ; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr ; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 ; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]] -; IR-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 128 +; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 128 ; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr ; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4 ; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]] -; IR-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 132 +; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 132 ; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr ; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]] diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll index 77b3434f4f159..da04a6e979425 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -372,8 +372,8 @@ define ptr @trunk_explicit(ptr %ptr, i64 %idx) { ; CHECK-LABEL: define ptr @trunk_explicit( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3216 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 3216 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: @@ -389,8 +389,8 @@ define ptr @trunk_long_idx(ptr %ptr, i64 %idx) { ; CHECK-LABEL: define ptr @trunk_long_idx( ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3216 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 3216 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: From b3caba7a296891cf32232e3e9cc784f0848b02d0 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Tue, 11 Mar 2025 10:05:26 -0400 Subject: [PATCH 2/8] Simplify test case and add more --- .../AMDGPU/preserve-inbounds.ll | 126 ++++++++++++++++-- 1 file changed, 112 insertions(+), 14 deletions(-) diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll index 01619aa481ddd..6865fcf663c58 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll @@ -17,25 +17,123 @@ entry: ret ptr 
%arrayidx } -; All offsets must be positive, so inbounds can be preserved. -define void @must_be_inbounds(ptr %dst, ptr %src, i32 %i) { +; All indices must be non-negative, so inbounds can be preserved. +define ptr @must_be_inbounds(ptr %p, i32 %i) { ; CHECK-LABEL: @must_be_inbounds( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[I_PROM:%.*]] = zext i32 [[I:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[SRC:%.*]], i64 [[I_PROM]] -; CHECK-NEXT: [[ARRAYIDX_SRC2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX_SRC2]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[DST:%.*]], i64 [[I_PROM]] -; CHECK-NEXT: [[ARRAYIDX_DST4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 4 -; CHECK-NEXT: store float [[TMP1]], ptr [[ARRAYIDX_DST4]], align 4 -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[I_PROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] ; entry: %i.prom = zext i32 %i to i64 %idx = add nsw i64 %i.prom, 1 - %arrayidx.src = getelementptr inbounds float, ptr %src, i64 %idx - %3 = load float, ptr %arrayidx.src, align 4 - %arrayidx.dst = getelementptr inbounds float, ptr %dst, i64 %idx - store float %3, ptr %arrayidx.dst, align 4 - ret void + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +; idx must be non-negative -> preserve inbounds +define ptr @sign_bit_clear(ptr %p, i64 %i) { +; CHECK-LABEL: @sign_bit_clear( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX:%.*]] = and i64 [[I:%.*]], 9223372036854775807 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[IDX]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX]] +; +entry: + %idx = and i64 %i, u0x7fffffffffffffff + %idx.add = add i64 %idx, 1 + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx.add + ret ptr %arrayidx +} + +; idx may be negative -> don't preserve inbounds +define ptr @sign_bit_not_clear(ptr %p, i64 %i) { +; CHECK-LABEL: @sign_bit_not_clear( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX:%.*]] = and i64 [[I:%.*]], -256 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IDX]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = and i64 %i, u0xffffffffffffff00 + %idx.add = add i64 %idx, 1 + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx.add + ret ptr %arrayidx +} + +; idx may be 0 or very negative -> don't preserve inbounds +define ptr @only_sign_bit_not_clear(ptr %p, i64 %i) { +; CHECK-LABEL: @only_sign_bit_not_clear( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX:%.*]] = and i64 [[I:%.*]], -9223372036854775808 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IDX]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = and i64 %i, u0x8000000000000000 + %idx.add = add i64 %idx, 1 + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx.add + ret ptr %arrayidx +} + +; all indices non-negative -> preserve inbounds +define ptr @multi_level_nonnegative(ptr %p, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @multi_level_nonnegative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MASKED_IDX1:%.*]] = and i64 [[IDX1:%.*]], 255 +; CHECK-NEXT: [[MASKED_IDX2:%.*]] = and i64 [[IDX2:%.*]], 65535 +; 
CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [10 x [20 x i32]], ptr [[P:%.*]], i64 0, i64 [[MASKED_IDX1]], i64 [[MASKED_IDX2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 180 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %masked.idx1 = and i64 %idx1, u0xff + %masked.idx2 = and i64 %idx2, u0xffff + %idx1.add = add i64 %masked.idx1, 2 + %idx2.add = add i64 %masked.idx2, 5 + %arrayidx = getelementptr inbounds [10 x [20 x i32]], ptr %p, i64 0, i64 %idx1.add, i64 %idx2.add + ret ptr %arrayidx +} + +; It doesn't matter that %idx2.add might be negative, the indices in the resulting GEPs are all non-negative -> preserve inbounds +define ptr @multi_level_mixed_okay(ptr %p, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @multi_level_mixed_okay( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MASKED_IDX1:%.*]] = and i64 [[IDX1:%.*]], 255 +; CHECK-NEXT: [[MASKED_IDX2:%.*]] = and i64 [[IDX2:%.*]], 65535 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [10 x [20 x i32]], ptr [[P:%.*]], i64 0, i64 [[MASKED_IDX1]], i64 [[MASKED_IDX2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 156 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %masked.idx1 = and i64 %idx1, u0xff + %masked.idx2 = and i64 %idx2, u0xffff + %idx1.add = add i64 %masked.idx1, 2 + %idx2.add = add i64 %masked.idx2, -1 + %arrayidx = getelementptr inbounds [10 x [20 x i32]], ptr %p, i64 0, i64 %idx1.add, i64 %idx2.add + ret ptr %arrayidx +} + +; One index may be negative -> don't preserve inbounds +define ptr @multi_level_mixed_not_okay(ptr %p, i64 %idx1, i64 %idx2) { +; CHECK-LABEL: @multi_level_mixed_not_okay( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MASKED_IDX1:%.*]] = and i64 [[IDX1:%.*]], -256 +; CHECK-NEXT: [[MASKED_IDX2:%.*]] = and i64 [[IDX2:%.*]], 65535 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [10 x [20 x i32]], ptr [[P:%.*]], i64 0, i64 [[MASKED_IDX1]], i64 [[MASKED_IDX2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 156 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %masked.idx1 = and i64 %idx1, u0xffffffffffffff00 + %masked.idx2 = and i64 %idx2, u0xffff + %idx1.add = add i64 %masked.idx1, 2 + %idx2.add = add i64 %masked.idx2, -1 + %arrayidx = getelementptr inbounds [10 x [20 x i32]], ptr %p, i64 0, i64 %idx1.add, i64 %idx2.add + ret ptr %arrayidx } From 69fe986a2c3da7e27d8c51fdf87a0e2d07ced98c Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 21 Mar 2025 09:33:18 -0400 Subject: [PATCH 3/8] Use isKnownNonNegative, add available information to the SimplifyQuery --- llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 70940939be165..71a8a08f9aed6 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1082,7 +1082,10 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { RecursivelyDeleteTriviallyDeadInstructions(UserChainTail); RecursivelyDeleteTriviallyDeadInstructions(OldIdx); MayRecoverInbounds = - MayRecoverInbounds && computeKnownBits(NewIdx, *DL).isNonNegative(); + MayRecoverInbounds && + isKnownNonNegative( + NewIdx, + SimplifyQuery(*DL, TLI, DT, /*AC=*/nullptr, /*CXTI=*/GEP)); } } } From 99df66378ac345759307b8dc04fee3cf38636ed9 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 27 Mar 2025 05:00:29 -0400 Subject: [PATCH 4/8] 
Preserve more GEP flags based on NUW --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 94 +++- .../AMDGPU/preserve-inbounds.ll | 405 +++++++++++++++++- 2 files changed, 480 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 71a8a08f9aed6..25cf0f23d1908 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -235,8 +235,10 @@ class ConstantOffsetExtractor { /// \p GEP The given GEP /// \p UserChainTail Outputs the tail of UserChain so that we can /// garbage-collect unused instructions in UserChain. + /// \p PreservesNUW Outputs whether the extraction allows preserving the + /// GEP's nuw flag, if it has one. static Value *Extract(Value *Idx, GetElementPtrInst *GEP, - User *&UserChainTail); + User *&UserChainTail, bool &PreservesNUW); /// Looks for a constant offset from the given GEP index without extracting /// it. It returns the numeric value of the extracted constant offset (0 if @@ -267,6 +269,13 @@ class ConstantOffsetExtractor { APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, bool ZeroExtended); + /// A helper function to check if a subsequent call to rebuildWithoutConst + /// will allow preserving the GEP's nuw flag. That is the case if all + /// reassociated binary operations are add nuw and no non-nuw trunc is + /// distributed through an add. + /// Can only be called after find has populated the UserChain. + bool checkRebuildingPreservesNUW() const; + /// After finding the constant offset C from the GEP index I, we build a new /// index I' s.t. I' + C = I. This function builds and returns the new /// index I' according to UserChain produced by function "find". @@ -676,6 +685,30 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { return Current; } +bool ConstantOffsetExtractor::checkRebuildingPreservesNUW() const { + auto AllowsPreservingNUW = [](User *U) { + if (BinaryOperator *BO = dyn_cast(U)) { + auto Opcode = BO->getOpcode(); + if (Opcode == BinaryOperator::Or) { + // Ors are only considered here if they are disjoint. The addition that + // they represent in this case is NUW. + assert(cast(BO)->isDisjoint()); + return true; + } + return Opcode == BinaryOperator::Add && BO->hasNoUnsignedWrap(); + } + // UserChain can only contain ConstantInt, CastInst, or BinaryOperator. + // Among the possible CastInsts, only trunc without nuw is a problem: If it + // is distributed through an add nuw, wrapping may occur: + // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" + if (TruncInst *TI = dyn_cast(U)) + return TI->hasNoUnsignedWrap(); + return true; + }; + + return all_of(UserChain, AllowsPreservingNUW); +} + Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { distributeExtsAndCloneChain(UserChain.size() - 1); // Remove all nullptrs (used to be s/zext) from UserChain. @@ -779,7 +812,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { } Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, - User *&UserChainTail) { + User *&UserChainTail, + bool &PreservesNUW) { ConstantOffsetExtractor Extractor(GEP->getIterator()); // Find a non-zero constant offset first. 
APInt ConstantOffset = @@ -787,8 +821,12 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, GEP->isInBounds()); if (ConstantOffset == 0) { UserChainTail = nullptr; + PreservesNUW = true; return nullptr; } + + PreservesNUW = Extractor.checkRebuildingPreservesNUW(); + // Separates the constant offset from the GEP index. Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset(); UserChainTail = Extractor.UserChain.back(); @@ -1052,7 +1090,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { } } - bool MayRecoverInbounds = AccumulativeByteOffset >= 0 && GEP->isInBounds(); + // Track information for preserving GEP flags. + bool AllOffsetsNonNegative = AccumulativeByteOffset >= 0; + bool AllNUWPreserved = true; // Remove the constant offset in each sequential index. The resultant GEP // computes the variadic base. @@ -1072,8 +1112,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // uses the variadic part as the new index. Value *OldIdx = GEP->getOperand(I); User *UserChainTail; - Value *NewIdx = - ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail); + bool PreservesNUW; + Value *NewIdx = ConstantOffsetExtractor::Extract( + OldIdx, GEP, UserChainTail, PreservesNUW); if (NewIdx != nullptr) { // Switches to the index with the constant offset removed. GEP->setOperand(I, NewIdx); @@ -1081,11 +1122,9 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // and the old index if they are not used. RecursivelyDeleteTriviallyDeadInstructions(UserChainTail); RecursivelyDeleteTriviallyDeadInstructions(OldIdx); - MayRecoverInbounds = - MayRecoverInbounds && - isKnownNonNegative( - NewIdx, - SimplifyQuery(*DL, TLI, DT, /*AC=*/nullptr, /*CXTI=*/GEP)); + AllOffsetsNonNegative = + AllOffsetsNonNegative && isKnownNonNegative(NewIdx, *DL); + AllNUWPreserved &= PreservesNUW; } } } @@ -1106,15 +1145,34 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { // inbounds keyword is not present, the offsets are added to the base // address with silently-wrapping two's complement arithmetic". // Therefore, the final code will be a semantically equivalent. - // - // If the initial GEP was inbounds and all variable indices and the + GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none(); + + // If the initial GEP was inbounds/nusw and all variable indices and the // accumulated offsets are non-negative, they can be added in any order and - // the intermediate results are in bounds. So, we can preserve the inbounds - // flag for both GEPs. GEPs with inbounds are more friendly to alias analysis. - // - // TODO(gep_nowrap): Preserve nuw? - GEPNoWrapFlags NewGEPFlags = - MayRecoverInbounds ? GEPNoWrapFlags::inBounds() : GEPNoWrapFlags::none(); + // the intermediate results are in bounds and don't overflow in a nusw sense. + // So, we can preserve the inbounds/nusw flag for both GEPs. + bool CanPreserveInBoundsNUSW = AllOffsetsNonNegative; + + // If the initial GEP was NUW and all operations that we reassociate were NUW + // additions, the resulting GEPs are also NUW. + if (GEP->hasNoUnsignedWrap() && AllNUWPreserved) { + NewGEPFlags |= GEPNoWrapFlags::noUnsignedWrap(); + // If the initial GEP additionally had NUSW (or inbounds, which implies + // NUSW), we know that the indices in the initial GEP must all have their + // signbit not set. For indices that are the result of NUW adds, the + // add-operands therefore also don't have their signbit set. 
Therefore, all + // indices of the resulting GEPs are non-negative -> we can preserve + // the inbounds/nusw flag. + CanPreserveInBoundsNUSW |= GEP->hasNoUnsignedSignedWrap(); + } + + if (CanPreserveInBoundsNUSW) { + if (GEP->isInBounds()) + NewGEPFlags |= GEPNoWrapFlags::inBounds(); + else if (GEP->hasNoUnsignedSignedWrap()) + NewGEPFlags |= GEPNoWrapFlags::noUnsignedSignedWrap(); + } + GEP->setNoWrapFlags(NewGEPFlags); // Lowers a GEP to either GEPs with a single index or arithmetic operations. diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll index 6865fcf663c58..a4f3e2b913116 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=separate-const-offset-from-gep -S | FileCheck %s +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=separate-const-offset-from-gep -S | FileCheck %s + +; gfx1200 is particularly interesting since it allows negative immediate offsets +; in flat instructions, so the transformation is applied in more cases. ; The inbounds flags cannot be preserved here: If the pointers point to the ; beginning of an object and %i is 1, the intermediate GEPs are out of bounds. @@ -137,3 +140,403 @@ entry: %arrayidx = getelementptr inbounds [10 x [20 x i32]], ptr %p, i64 0, i64 %idx1.add, i64 %idx2.add ret ptr %arrayidx } + + +define ptr @nuw_implies_nuw(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_implies_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i64 %i, 1 + %arrayidx = getelementptr nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +define ptr @nuw_implies_nuw_negative(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_implies_nuw_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr nuw i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i64 %i, -16 + %arrayidx = getelementptr nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +define ptr @nuw_inbounds_implies_nuw_inbounds(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i64 %i, 1 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +; This is poison anyway, so we can preserve the flags. 
+define ptr @nuw_inbounds_implies_nuw_inbounds_negative(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i64 %i, -16 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +define ptr @nuw_nusw_implies_nuw_nusw(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_nusw_implies_nuw_nusw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nusw nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr nusw nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i64 %i, 1 + %arrayidx = getelementptr nusw nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +; Also poison. +define ptr @nuw_implies_nuw_nusw_negative(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_implies_nuw_nusw_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nusw nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr nusw nuw i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i64 %i, -16 + %arrayidx = getelementptr nusw nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + + +define ptr @nuw_inbounds_implies_nuw_inbounds_ordisjoint(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_ordisjoint( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = or disjoint i64 %i, 1 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +; This is poison anyway, so we can do the transformation. 
+define ptr @nuw_inbounds_implies_nuw_inbounds_ordisjoint_negative(ptr %p, i64 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_ordisjoint_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = or disjoint i64 %i, -16 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +; Check that nothing happens for non-disjoint ors +define ptr @or_no_disjoint(ptr %p, i64 %i) { +; CHECK-LABEL: @or_no_disjoint( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX:%.*]] = or i64 [[I:%.*]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[IDX]] +; CHECK-NEXT: ret ptr [[ARRAYIDX]] +; +entry: + %idx = or i64 %i, 1 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +define ptr @no_nuw_inbounds_for_sub(ptr %p, i64 %i) { +; CHECK-LABEL: @no_nuw_inbounds_for_sub( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = sub nuw i64 %i, 1 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +define ptr @no_nuw_inbounds_for_sub_negative(ptr %p, i64 %i) { +; CHECK-LABEL: @no_nuw_inbounds_for_sub_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[I:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = sub nuw i64 %i, -16 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx + ret ptr %arrayidx +} + +; Can't preserved nuw and other flags here as distributing the trunc towards the +; leaves can introduce new wraps. +define ptr @nuw_inbounds_trunc(ptr %p, i128 %i) { +; CHECK-LABEL: @nuw_inbounds_trunc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc i128 [[I:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i128 %i, 1 + %idx.conv = trunc i128 %idx to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.conv + ret ptr %arrayidx +} + +; trunc nuw is not a problem. 
+define ptr @nuw_inbounds_implies_nuw_inbounds_trunc_nuw(ptr %p, i128 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_trunc_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i128 [[I:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i128 %i, 1 + %idx.conv = trunc nuw i128 %idx to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.conv + ret ptr %arrayidx +} + +define ptr @nuw_inbounds_implies_nuw_inbounds_sext(ptr %p, i32 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_sext( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i32 %i, 1 + %idx.conv = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.conv + ret ptr %arrayidx +} + +define ptr @nuw_inbounds_implies_nuw_inbounds_zext(ptr %p, i32 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_zext( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[I:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i32 %i, 1 + %idx.conv = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.conv + ret ptr %arrayidx +} + +define ptr @nuw_inbounds_implies_nuw_inbounds_zext_negative(ptr %p, i8 %i) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_zext_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX_CONV:%.*]] = zext i8 [[I:%.*]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[IDX_CONV]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 960 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = add nuw i8 %i, -16 + %idx.conv = zext i8 %idx to i64 + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.conv + ret ptr %arrayidx +} + + +; This test and the following ones mask most bits of %v off to facilitate +; validation with alive2 while still allowing interesting values. 
+define ptr @nuw_inbounds_implies_nuw_inbounds_nested(ptr %p, i64 %i, i64 %v) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V_MASKED:%.*]] = and i64 [[V:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V_MASKED]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, 1 + %idx2 = add nuw i64 %idx1, %v.masked + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + +define ptr @nuw_inbounds_implies_nuw_inbounds_nested_negative(ptr %p, i64 %i, i64 %v) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_nested_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, 1 + %idx2 = add nuw i64 %idx1, %v.masked + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + +define ptr @nuw_implies_nuw_nested(ptr %p, i64 %i, i64 %v) { +; CHECK-LABEL: @nuw_implies_nuw_nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nuw i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, 1 + %idx2 = add nuw i64 %idx1, %v.masked + %arrayidx = getelementptr nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + +define ptr @nuw_implies_nuw_nested_negative(ptr %p, i64 %i, i64 %v) { +; CHECK-LABEL: @nuw_implies_nuw_nested_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nuw i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr nuw i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, -16 + %idx2 = add nuw i64 %idx1, %v.masked + %arrayidx = getelementptr nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + +define ptr @nuw_nusw_implies_nuw_nusw_nested(ptr %p, i64 %i, i64 %v) { +; CHECK-LABEL: @nuw_nusw_implies_nuw_nusw_nested( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nusw nuw i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr nusw nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, 1 + %idx2 = add nuw i64 %idx1, %v.masked + %arrayidx = getelementptr nusw nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + +define ptr @nuw_nusw_implies_nuw_nusw_nested_negative(ptr %p, i64 %i, i64 %v) { +; 
CHECK-LABEL: @nuw_nusw_implies_nuw_nusw_nested_negative( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nusw nuw i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr nusw nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, 1 + %idx2 = add nuw i64 %idx1, %v.masked + %arrayidx = getelementptr nusw nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + + +; Neither inbounds nor nuw can be preserved. +define ptr @nuw_inbounds_nested_not_all_nuw(ptr %p, i64 %i, i64 %v) { +; CHECK-LABEL: @nuw_inbounds_nested_not_all_nuw( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 +; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IDX22]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; +entry: + %v.masked = and i64 %v, u0xf0000003c000000f + %idx1 = add nuw i64 %i, 1 + %idx2 = add i64 %idx1, %v.masked + %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx2 + ret ptr %arrayidx +} + + +define ptr @nuw_inbounds_implies_nuw_inbounds_multilevel(ptr %src, i64 %i1, i64 %i2) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_multilevel( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[SRC:%.*]], i64 [[I1:%.*]], i64 [[I2:%.*]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 24 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; + %idx1 = add nuw i64 %i1, 1 + %idx2 = add nuw i64 2, %i2 + %arrayidx = getelementptr inbounds nuw [4 x i32], ptr %src, i64 %idx1, i64 %idx2 + ret ptr %arrayidx +} + +; Neither inbounds nor nuw can be preserved. +define ptr @nuw_inbounds_multilevel_not_all_nuw(ptr %src, i64 %i1, i64 %i2) { +; CHECK-LABEL: @nuw_inbounds_multilevel_not_all_nuw( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [4 x i32], ptr [[SRC:%.*]], i64 [[I1:%.*]], i64 [[I2:%.*]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24 +; CHECK-NEXT: ret ptr [[ARRAYIDX3]] +; + %idx1 = add nuw i64 %i1, 1 + %idx2 = add i64 2, %i2 + %arrayidx = getelementptr inbounds nuw [4 x i32], ptr %src, i64 %idx1, i64 %idx2 + ret ptr %arrayidx +} + +; Missing information about non-extracted indices does not matter. 
+define ptr @nuw_inbounds_implies_nuw_inbounds_multilevel_one_unfolded(ptr %src, i64 %i1, i64 %v) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_multilevel_one_unfolded( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[SRC:%.*]], i64 [[I1:%.*]], i64 [[V:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 16 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; + %idx1 = add nuw i64 %i1, 1 + %arrayidx = getelementptr inbounds nuw [4 x i32], ptr %src, i64 %idx1, i64 %v + ret ptr %arrayidx +} + +define ptr @nuw_inbounds_implies_nuw_inbounds_multilevel_other_unfolded(ptr %src, i64 %i1, i64 %v) { +; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_multilevel_other_unfolded( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[SRC:%.*]], i64 [[V:%.*]], i64 [[I1:%.*]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; + %idx1 = add nuw i64 %i1, 1 + %arrayidx = getelementptr inbounds nuw [4 x i32], ptr %src, i64 %v, i64 %idx1 + ret ptr %arrayidx +} From 56dbc9a86b4927518f011dc9f425076d083271cd Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Mon, 31 Mar 2025 07:01:23 -0400 Subject: [PATCH 5/8] Change some _negative tests to actually use negative numbers --- .../AMDGPU/preserve-inbounds.ll | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll index a4f3e2b913116..2a5b678e91fd8 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll @@ -394,12 +394,12 @@ define ptr @nuw_inbounds_implies_nuw_inbounds_nested_negative(ptr %p, i64 %i, i6 ; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 ; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[IDX22]] -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 -64 ; CHECK-NEXT: ret ptr [[ARRAYIDX3]] ; entry: %v.masked = and i64 %v, u0xf0000003c000000f - %idx1 = add nuw i64 %i, 1 + %idx1 = add nuw i64 %i, -16 %idx2 = add nuw i64 %idx1, %v.masked %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx2 ret ptr %arrayidx @@ -462,12 +462,12 @@ define ptr @nuw_nusw_implies_nuw_nusw_nested_negative(ptr %p, i64 %i, i64 %v) { ; CHECK-NEXT: [[V:%.*]] = and i64 [[V1:%.*]], -1152921488500719601 ; CHECK-NEXT: [[IDX22:%.*]] = add i64 [[I:%.*]], [[V]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr nusw nuw i32, ptr [[P:%.*]], i64 [[IDX22]] -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr nusw nuw i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr nusw nuw i8, ptr [[TMP0]], i64 -64 ; CHECK-NEXT: ret ptr [[ARRAYIDX3]] ; entry: %v.masked = and i64 %v, u0xf0000003c000000f - %idx1 = add nuw i64 %i, 1 + %idx1 = add nuw i64 %i, -16 %idx2 = add nuw i64 %idx1, %v.masked %arrayidx = getelementptr nusw nuw i32, ptr %p, i64 %idx2 ret ptr %arrayidx From 871264cbb4bc43392e9611d432262f3e68ddd515 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 10 Apr 2025 08:24:19 -0400 Subject: [PATCH 6/8] Simplify the checkRebuildingPreservesNUW computation --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 57 ++++++++----------- 1 
file changed, 25 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 25cf0f23d1908..954a0e78404b8 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -269,13 +269,6 @@ class ConstantOffsetExtractor { APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, bool ZeroExtended); - /// A helper function to check if a subsequent call to rebuildWithoutConst - /// will allow preserving the GEP's nuw flag. That is the case if all - /// reassociated binary operations are add nuw and no non-nuw trunc is - /// distributed through an add. - /// Can only be called after find has populated the UserChain. - bool checkRebuildingPreservesNUW() const; - /// After finding the constant offset C from the GEP index I, we build a new /// index I' s.t. I' + C = I. This function builds and returns the new /// index I' according to UserChain produced by function "find". @@ -685,30 +678,6 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) { return Current; } -bool ConstantOffsetExtractor::checkRebuildingPreservesNUW() const { - auto AllowsPreservingNUW = [](User *U) { - if (BinaryOperator *BO = dyn_cast(U)) { - auto Opcode = BO->getOpcode(); - if (Opcode == BinaryOperator::Or) { - // Ors are only considered here if they are disjoint. The addition that - // they represent in this case is NUW. - assert(cast(BO)->isDisjoint()); - return true; - } - return Opcode == BinaryOperator::Add && BO->hasNoUnsignedWrap(); - } - // UserChain can only contain ConstantInt, CastInst, or BinaryOperator. - // Among the possible CastInsts, only trunc without nuw is a problem: If it - // is distributed through an add nuw, wrapping may occur: - // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" - if (TruncInst *TI = dyn_cast(U)) - return TI->hasNoUnsignedWrap(); - return true; - }; - - return all_of(UserChain, AllowsPreservingNUW); -} - Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { distributeExtsAndCloneChain(UserChain.size() - 1); // Remove all nullptrs (used to be s/zext) from UserChain. @@ -811,6 +780,30 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { return NewBO; } +/// A helper function to check if reassociating through an entry in the user +/// chain would invalidate the GEP's nuw flag. +static bool allowsPreservingNUW(User *U) { + assert(isa(U) || isa(U) || isa(U)); + if (BinaryOperator *BO = dyn_cast(U)) { + // Binary operations needd to be effectively add nuw. + auto Opcode = BO->getOpcode(); + if (Opcode == BinaryOperator::Or) { + // Ors are only considered here if they are disjoint. The addition that + // they represent in this case is NUW. + assert(cast(BO)->isDisjoint()); + return true; + } + return Opcode == BinaryOperator::Add && BO->hasNoUnsignedWrap(); + } + // UserChain can only contain ConstantInt, CastInst, or BinaryOperator. 
+ // Among the possible CastInsts, only trunc without nuw is a problem: If it + // is distributed through an add nuw, wrapping may occur: + // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" + if (TruncInst *TI = dyn_cast(U)) + return TI->hasNoUnsignedWrap(); + return true; +} + Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, User *&UserChainTail, bool &PreservesNUW) { @@ -825,7 +818,7 @@ Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, return nullptr; } - PreservesNUW = Extractor.checkRebuildingPreservesNUW(); + PreservesNUW = all_of(Extractor.UserChain, allowsPreservingNUW); // Separates the constant offset from the GEP index. Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset(); From dba2b315ef64a4fe28d11871b4f63a733c89a158 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Thu, 10 Apr 2025 10:06:25 -0400 Subject: [PATCH 7/8] Remove isa assert, fix comment typo --- llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 954a0e78404b8..3556a4d152f1c 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -783,9 +783,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { /// A helper function to check if reassociating through an entry in the user /// chain would invalidate the GEP's nuw flag. static bool allowsPreservingNUW(User *U) { - assert(isa(U) || isa(U) || isa(U)); if (BinaryOperator *BO = dyn_cast(U)) { - // Binary operations needd to be effectively add nuw. + // Binary operations need to be effectively add nuw. auto Opcode = BO->getOpcode(); if (Opcode == BinaryOperator::Or) { // Ors are only considered here if they are disjoint. The addition that @@ -801,7 +800,7 @@ static bool allowsPreservingNUW(User *U) { // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)" if (TruncInst *TI = dyn_cast(U)) return TI->hasNoUnsignedWrap(); - return true; + return isa(U) || isa(U); } Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP, From 7148a368cf74c0db1adabf62f8c60cccdf570cf0 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Mon, 14 Apr 2025 04:11:13 -0400 Subject: [PATCH 8/8] "User *U" -> "const User *U" --- llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 3556a4d152f1c..320b79203c0b3 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -782,8 +782,8 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { /// A helper function to check if reassociating through an entry in the user /// chain would invalidate the GEP's nuw flag. -static bool allowsPreservingNUW(User *U) { - if (BinaryOperator *BO = dyn_cast(U)) { +static bool allowsPreservingNUW(const User *U) { + if (const BinaryOperator *BO = dyn_cast(U)) { // Binary operations need to be effectively add nuw. 
     auto Opcode = BO->getOpcode();
     if (Opcode == BinaryOperator::Or) {
@@ -798,7 +798,7 @@ static bool allowsPreservingNUW(const User *U) {
   // Among the possible CastInsts, only trunc without nuw is a problem: If it
   // is distributed through an add nuw, wrapping may occur:
   // "add nuw trunc(a), trunc(b)" is more poisonous than "trunc(add nuw a, b)"
-  if (TruncInst *TI = dyn_cast<TruncInst>(U))
+  if (const TruncInst *TI = dyn_cast<TruncInst>(U))
     return TI->hasNoUnsignedWrap();
   return isa<ConstantInt>(U) || isa<CastInst>(U);
 }
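
Illustrative note (a sketch, not part of the patches themselves): the net effect of this series on a split GEP can be seen in hand-written LLVM IR modeled on the @nuw_inbounds_implies_nuw_inbounds test in preserve-inbounds.ll. Before the pass:

  %idx = add nuw i64 %i, 1
  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx

After separate-const-offset-from-gep with this series applied, both resulting GEPs keep the inbounds and nuw flags, because the reassociated add is nuw and the extracted constant offset is non-negative:

  %base = getelementptr inbounds nuw i32, ptr %p, i64 %i
  %arrayidx2 = getelementptr inbounds nuw i8, ptr %base, i64 4

The names %base and %arrayidx2 are placeholders; the actual pass output uses the numbered temporaries shown in the CHECK lines of the tests above.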