diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 92f3865818530..cd0fb1b82ddfe 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -867,13 +868,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // We have some custom DAG combine patterns for these nodes setTargetDAGCombine( - {ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, - ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM, - ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM, - ISD::FMINIMUMNUM, ISD::MUL, ISD::SHL, - ISD::SREM, ISD::UREM, ISD::VSELECT, - ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, ISD::LOAD, - ISD::STORE, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}); + {ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, + ISD::FADD, ISD::FMAXNUM, ISD::FMINNUM, + ISD::FMAXIMUM, ISD::FMINIMUM, ISD::FMAXIMUMNUM, + ISD::FMINIMUMNUM, ISD::MUL, ISD::SELECT, + ISD::SHL, ISD::SREM, ISD::UREM, + ISD::VSELECT, ISD::BUILD_VECTOR, ISD::ADDRSPACECAST, + ISD::LOAD, ISD::STORE, ISD::ZERO_EXTEND, + ISD::SIGN_EXTEND}); // setcc for f16x2 and bf16x2 needs special handling to prevent // legalizer's attempt to scalarize it due to v2i1 not being legal. @@ -6233,6 +6235,61 @@ static SDValue PerformEXTRACTCombine(SDNode *N, return Result; } +/// Transform patterns like: +/// (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt)) +/// (select (ult shift_amt, BitWidth), (srl/shl x, shift_amt), 0) +/// Into: +/// (NVPTXISD::SRL_CLAMP x, shift_amt) or (NVPTXISD::SHL_CLAMP x, shift_amt) +/// +/// These patterns arise from C/C++ code like `shift >= 32 ? 0 : x >> shift` +/// which guards against undefined behavior. PTX shr/shl instructions clamp +/// shift amounts >= BitWidth to produce 0 for logical shifts, making the +/// guard redundant. +/// +/// Note: We only handle SRL and SHL, not SRA, because arithmetic right +/// shifts could produce 0 or -1 when shift >= BitWidth. +/// Note: We don't handle uge or ule. These don't appear because of +/// canonicalization. +static SDValue PerformSELECTShiftCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (!DCI.isAfterLegalizeDAG()) + return SDValue(); + + using namespace SDPatternMatch; + unsigned BitWidth = N->getValueType(0).getSizeInBits(); + SDValue ShiftAmt, ShiftOp; + + // Match logical shifts where the shift amount in the guard matches the shift + // amount in the operation. + auto LogicalShift = + m_AllOf(m_Value(ShiftOp), + m_AnyOf(m_Srl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))), + m_Shl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))))); + + // shift_amt > BitWidth-1 ? 0 : shift_op + bool MatchedUGT = + sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt), + m_SpecificInt(APInt(BitWidth, BitWidth - 1)), + m_SpecificCondCode(ISD::SETUGT)), + m_Zero(), LogicalShift)); + // shift_amt < BitWidth ? shift_op : 0 + bool MatchedULT = + !MatchedUGT && + sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt), + m_SpecificInt(APInt(BitWidth, BitWidth)), + m_SpecificCondCode(ISD::SETULT)), + LogicalShift, m_Zero())); + + if (!MatchedUGT && !MatchedULT) + return SDValue(); + + // Return a clamp shift operation, which has the same semantics as PTX shift. 
+ unsigned ClampOpc = ShiftOp.getOpcode() == ISD::SRL ? NVPTXISD::SRL_CLAMP + : NVPTXISD::SHL_CLAMP; + return DCI.DAG.getNode(ClampOpc, SDLoc(N), ShiftOp.getValueType(), + ShiftOp.getOperand(0), ShiftOp.getOperand(1)); +} + static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue VA = N->getOperand(1); @@ -6544,6 +6601,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: return combineSTORE(N, DCI, STI); + case ISD::SELECT: + return PerformSELECTShiftCombine(N, DCI); case ISD::VSELECT: return PerformVSELECTCombine(N, DCI); } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 9c1ec38fb1f3a..74a552502ccf2 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1323,6 +1323,15 @@ defm SHL : SHIFT<"shl.b", shl>; defm SRA : SHIFT<"shr.s", sra>; defm SRL : SHIFT<"shr.u", srl>; +// Shift with clamping semantics - these have defined behavior for shift amounts +// >= BitWidth (returning 0 for logical shifts). Used to optimize guarded shift +// patterns like `shift >= 32 ? 0 : x >> shift`. +def shl_clamp : SDNode<"NVPTXISD::SHL_CLAMP", SDTIntShiftOp, []>; +def srl_clamp : SDNode<"NVPTXISD::SRL_CLAMP", SDTIntShiftOp, []>; + +defm SHL_CLAMP : SHIFT<"shl.b", shl_clamp>; +defm SRL_CLAMP : SHIFT<"shr.u", srl_clamp>; + // Bit-reverse foreach t = [I64RT, I32RT] in def BREV_ # t.PtxType : diff --git a/llvm/test/CodeGen/NVPTX/shift-opt.ll b/llvm/test/CodeGen/NVPTX/shift-opt.ll index e0d22c62993ba..554839436f451 100644 --- a/llvm/test/CodeGen/NVPTX/shift-opt.ll +++ b/llvm/test/CodeGen/NVPTX/shift-opt.ll @@ -171,3 +171,382 @@ define i64 @test_negative_use_shl(i64 %x, i32 %y) { call void @use(i64 %shl) ret i64 %srl } + +;; ============================================================================ +;; Guarded shift patterns +;; +;; C/C++ code like `shift >= 32 ? 0 : x >> shift` generates a guarded shift +;; pattern to avoid undefined behavior. PTX shr/shl instructions clamp shift +;; amounts >= BitWidth to produce 0, making the guard redundant. +;; +;; Transformation 1 (ugt form): +;; (select (icmp ugt shift, BitWidth-1), 0, (srl x, shift)) +;; i.e., shift > 31 ? 0 : x >> shift +;; --> (srl x, shift) +;; +;; Transformation 2 (ult form): +;; (select (icmp ult shift, BitWidth), (srl x, shift), 0) +;; i.e., shift < 32 ? x >> shift : 0 +;; --> (srl x, shift) +;; +;; Same transformation applies to left shifts. 
+;; ============================================================================ + +;; --- i8 shr tests (negative - guard must remain) --- + +; Do NOT optimize - PTX uses 16-bit registers, clamping happens at 16 not 8 +define i8 @test_guarded_i8_ugt(i8 %x, i8 %shift) { +; CHECK-LABEL: test_guarded_i8_ugt( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_guarded_i8_ugt_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [test_guarded_i8_ugt_param_1]; +; CHECK-NEXT: setp.gt.u32 %p1, %r1, 7; +; CHECK-NEXT: shr.u16 %rs2, %rs1, %r1; +; CHECK-NEXT: selp.b16 %rs3, 0, %rs2, %p1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ugt i8 %shift, 7 + %shr = lshr i8 %x, %shift + %sel = select i1 %cmp, i8 0, i8 %shr + ret i8 %sel +} + +; Do NOT optimize - PTX uses 16-bit registers, clamping happens at 16 not 8 +define i8 @test_guarded_i8_ult(i8 %x, i8 %shift) { +; CHECK-LABEL: test_guarded_i8_ult( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_guarded_i8_ult_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [test_guarded_i8_ult_param_1]; +; CHECK-NEXT: setp.lt.u32 %p1, %r1, 8; +; CHECK-NEXT: shr.u16 %rs2, %rs1, %r1; +; CHECK-NEXT: selp.b16 %rs3, %rs2, 0, %p1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ult i8 %shift, 8 + %shr = lshr i8 %x, %shift + %sel = select i1 %cmp, i8 %shr, i8 0 + ret i8 %sel +} + +;; --- i16 shr tests --- + +; (select (ugt shift, 15), 0, (srl x, shift)) --> (srl x, shift) +define i16 @test_guarded_i16_ugt(i16 %x, i16 %shift) { +; CHECK-LABEL: test_guarded_i16_ugt( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_guarded_i16_ugt_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [test_guarded_i16_ugt_param_1]; +; CHECK-NEXT: shr.u16 %rs2, %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ugt i16 %shift, 15 + %shr = lshr i16 %x, %shift + %sel = select i1 %cmp, i16 0, i16 %shr + ret i16 %sel +} + +; (select (ult shift, 16), (srl x, shift), 0) --> (srl x, shift) +define i16 @test_guarded_i16_ult(i16 %x, i16 %shift) { +; CHECK-LABEL: test_guarded_i16_ult( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_guarded_i16_ult_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [test_guarded_i16_ult_param_1]; +; CHECK-NEXT: shr.u16 %rs2, %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ult i16 %shift, 16 + %shr = lshr i16 %x, %shift + %sel = select i1 %cmp, i16 %shr, i16 0 + ret i16 %sel +} + +;; --- i32 shr tests --- + +; (select (ugt shift, 31), 0, (srl x, shift)) --> (srl x, shift) +define i32 @test_guarded_i32_ugt(i32 %x, i32 %shift) { +; CHECK-LABEL: test_guarded_i32_ugt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i32_ugt_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_guarded_i32_ugt_param_1]; +; CHECK-NEXT: shr.u32 %r3, %r1, %r2; +; 
CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %cmp = icmp ugt i32 %shift, 31 + %shr = lshr i32 %x, %shift + %sel = select i1 %cmp, i32 0, i32 %shr + ret i32 %sel +} + +; (select (ult shift, 32), (srl x, shift), 0) --> (srl x, shift) +define i32 @test_guarded_i32_ult(i32 %x, i32 %shift) { +; CHECK-LABEL: test_guarded_i32_ult( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i32_ult_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_guarded_i32_ult_param_1]; +; CHECK-NEXT: shr.u32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %cmp = icmp ult i32 %shift, 32 + %shr = lshr i32 %x, %shift + %sel = select i1 %cmp, i32 %shr, i32 0 + ret i32 %sel +} + +;; --- i64 shr tests --- + +; (select (ugt shift, 63), 0, (srl x, shift)) --> (srl x, shift) +define i64 @test_guarded_i64_ugt(i64 %x, i64 %shift) { +; CHECK-LABEL: test_guarded_i64_ugt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_guarded_i64_ugt_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i64_ugt_param_1]; +; CHECK-NEXT: shr.u64 %rd2, %rd1, %r1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %cmp = icmp ugt i64 %shift, 63 + %shr = lshr i64 %x, %shift + %sel = select i1 %cmp, i64 0, i64 %shr + ret i64 %sel +} + +; (select (ult shift, 64), (srl x, shift), 0) --> (srl x, shift) +define i64 @test_guarded_i64_ult(i64 %x, i64 %shift) { +; CHECK-LABEL: test_guarded_i64_ult( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_guarded_i64_ult_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i64_ult_param_1]; +; CHECK-NEXT: shr.u64 %rd2, %rd1, %r1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %cmp = icmp ult i64 %shift, 64 + %shr = lshr i64 %x, %shift + %sel = select i1 %cmp, i64 %shr, i64 0 + ret i64 %sel +} + +;; --- i8 shl tests (negative - guard must remain) --- + +; Do NOT optimize - PTX uses 16-bit registers, clamping happens at 16 not 8 +define i8 @test_guarded_i8_ugt_shl(i8 %x, i8 %shift) { +; CHECK-LABEL: test_guarded_i8_ugt_shl( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_guarded_i8_ugt_shl_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [test_guarded_i8_ugt_shl_param_1]; +; CHECK-NEXT: setp.gt.u32 %p1, %r1, 7; +; CHECK-NEXT: shl.b16 %rs2, %rs1, %r1; +; CHECK-NEXT: selp.b16 %rs3, 0, %rs2, %p1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ugt i8 %shift, 7 + %shl = shl i8 %x, %shift + %sel = select i1 %cmp, i8 0, i8 %shl + ret i8 %sel +} + +; Do NOT optimize - PTX uses 16-bit registers, clamping happens at 16 not 8 +define i8 @test_guarded_i8_ult_shl(i8 %x, i8 %shift) { +; CHECK-LABEL: test_guarded_i8_ult_shl( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [test_guarded_i8_ult_shl_param_0]; +; CHECK-NEXT: ld.param.b8 %r1, [test_guarded_i8_ult_shl_param_1]; +; CHECK-NEXT: setp.lt.u32 %p1, %r1, 8; +; CHECK-NEXT: shl.b16 %rs2, %rs1, %r1; +; CHECK-NEXT: selp.b16 %rs3, %rs2, 0, %p1; 
+; CHECK-NEXT: cvt.u32.u16 %r2, %rs3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ult i8 %shift, 8 + %shl = shl i8 %x, %shift + %sel = select i1 %cmp, i8 %shl, i8 0 + ret i8 %sel +} + +;; --- i16 shl tests --- + +; (select (ugt shift, 15), 0, (shl x, shift)) --> (shl x, shift) +define i16 @test_guarded_i16_ugt_shl(i16 %x, i16 %shift) { +; CHECK-LABEL: test_guarded_i16_ugt_shl( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_guarded_i16_ugt_shl_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [test_guarded_i16_ugt_shl_param_1]; +; CHECK-NEXT: shl.b16 %rs2, %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ugt i16 %shift, 15 + %shl = shl i16 %x, %shift + %sel = select i1 %cmp, i16 0, i16 %shl + ret i16 %sel +} + +; (select (ult shift, 16), (shl x, shift), 0) --> (shl x, shift) +define i16 @test_guarded_i16_ult_shl(i16 %x, i16 %shift) { +; CHECK-LABEL: test_guarded_i16_ult_shl( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [test_guarded_i16_ult_shl_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [test_guarded_i16_ult_shl_param_1]; +; CHECK-NEXT: shl.b16 %rs2, %rs1, %r1; +; CHECK-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %cmp = icmp ult i16 %shift, 16 + %shl = shl i16 %x, %shift + %sel = select i1 %cmp, i16 %shl, i16 0 + ret i16 %sel +} + +;; --- i32 shl tests --- + +; (select (ugt shift, 31), 0, (shl x, shift)) --> (shl x, shift) +define i32 @test_guarded_i32_ugt_shl(i32 %x, i32 %shift) { +; CHECK-LABEL: test_guarded_i32_ugt_shl( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i32_ugt_shl_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_guarded_i32_ugt_shl_param_1]; +; CHECK-NEXT: shl.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %cmp = icmp ugt i32 %shift, 31 + %shl = shl i32 %x, %shift + %sel = select i1 %cmp, i32 0, i32 %shl + ret i32 %sel +} + +; (select (ult shift, 32), (shl x, shift), 0) --> (shl x, shift) +define i32 @test_guarded_i32_ult_shl(i32 %x, i32 %shift) { +; CHECK-LABEL: test_guarded_i32_ult_shl( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i32_ult_shl_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_guarded_i32_ult_shl_param_1]; +; CHECK-NEXT: shl.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %cmp = icmp ult i32 %shift, 32 + %shl = shl i32 %x, %shift + %sel = select i1 %cmp, i32 %shl, i32 0 + ret i32 %sel +} + +;; --- i64 shl tests --- + +; (select (ugt shift, 63), 0, (shl x, shift)) --> (shl x, shift) +define i64 @test_guarded_i64_ugt_shl(i64 %x, i64 %shift) { +; CHECK-LABEL: test_guarded_i64_ugt_shl( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_guarded_i64_ugt_shl_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i64_ugt_shl_param_1]; +; CHECK-NEXT: shl.b64 %rd2, %rd1, %r1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %cmp = icmp ugt i64 %shift, 63 + %shl = shl i64 %x, %shift + %sel = select i1 
%cmp, i64 0, i64 %shl + ret i64 %sel +} + +; (select (ult shift, 64), (shl x, shift), 0) --> (shl x, shift) +define i64 @test_guarded_i64_ult_shl(i64 %x, i64 %shift) { +; CHECK-LABEL: test_guarded_i64_ult_shl( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_guarded_i64_ult_shl_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i64_ult_shl_param_1]; +; CHECK-NEXT: shl.b64 %rd2, %rd1, %r1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; +; CHECK-NEXT: ret; + %cmp = icmp ult i64 %shift, 64 + %shl = shl i64 %x, %shift + %sel = select i1 %cmp, i64 %shl, i64 0 + ret i64 %sel +} + +; Verify that the optimization does not apply when the shift in the guard is different from the shift in the shift operation. +define i64 @test_guarded_i64_ult_shl_different_shift(i64 %x, i64 %shift1, i64 %shift2) { +; CHECK-LABEL: test_guarded_i64_ult_shl_different_shift( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd1, [test_guarded_i64_ult_shl_different_shift_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_guarded_i64_ult_shl_different_shift_param_1]; +; CHECK-NEXT: setp.lt.u64 %p1, %rd2, 64; +; CHECK-NEXT: ld.param.b32 %r1, [test_guarded_i64_ult_shl_different_shift_param_2]; +; CHECK-NEXT: shl.b64 %rd3, %rd1, %r1; +; CHECK-NEXT: selp.b64 %rd4, %rd3, 0, %p1; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; +; CHECK-NEXT: ret; + %cmp = icmp ult i64 %shift1, 64 + %shl = shl i64 %x, %shift2 + %sel = select i1 %cmp, i64 %shl, i64 0 + ret i64 %sel +}
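
For reference, a minimal CUDA sketch of the source-level pattern this combine targets (kernel and parameter names here are illustrative, not taken from the patch). The guard `s >= 32 ? 0 : x >> s` is the ugt form once `uge 32` is canonicalized to `ugt 31`; with this change it is expected to lower to a single shr.u32 with no setp/selp pair, matching the i32 tests above.

// Illustrative example, not part of the patch: a guarded 32-bit right shift.
// The C++ guard avoids undefined behavior for s >= 32; PTX shr.u32 already
// clamps such shift amounts and produces 0, so the select should be folded
// away by PerformSELECTShiftCombine.
__global__ void guarded_shift(unsigned *out, const unsigned *x,
                              const unsigned *amt, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    unsigned s = amt[i];
    out[i] = s >= 32 ? 0u : (x[i] >> s);
  }
}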