diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 15a2370e5d8b8..aa3668d3e9aae 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1069,7 +1069,8 @@ class SelectionDAG { SDValue EVL); /// Returns sum of the base pointer and offset. - /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap by default. + /// Unlike getObjectPtrOffset this does not set NoUnsignedWrap or InBounds by + /// default. SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags = SDNodeFlags()); SDValue getMemBasePlusOffset(SDValue Base, SDValue Offset, const SDLoc &DL, @@ -1077,15 +1078,18 @@ class SelectionDAG { /// Create an add instruction with appropriate flags when used for /// addressing some offset of an object. i.e. if a load is split into multiple - /// components, create an add nuw from the base pointer to the offset. + /// components, create an add nuw inbounds from the base pointer to the + /// offset. SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset) { - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, SDValue Offset) { // The object itself can't wrap around the address space, so it shouldn't be // possible for the adds of the offsets to the split parts to overflow. - return getMemBasePlusOffset(Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap); + return getMemBasePlusOffset( + Ptr, Offset, SL, SDNodeFlags::NoUnsignedWrap | SDNodeFlags::InBounds); } /// Return a new CALLSEQ_START node, that starts new call frame, in which diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a54857e1037e2..63e012c04fb59 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1201,9 +1201,12 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, if (DAG.isConstantIntBuildVectorOrConstantInt(N01)) { SDNodeFlags NewFlags; - if (N0.getOpcode() == ISD::ADD && N0->getFlags().hasNoUnsignedWrap() && - Flags.hasNoUnsignedWrap()) - NewFlags |= SDNodeFlags::NoUnsignedWrap; + if (N0.getOpcode() == ISD::ADD) { + if (N0->getFlags().hasNoUnsignedWrap() && Flags.hasNoUnsignedWrap()) + NewFlags |= SDNodeFlags::NoUnsignedWrap; + if (N0->getFlags().hasInBounds() && Flags.hasInBounds()) + NewFlags |= SDNodeFlags::InBounds; + } if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d1f92c9ef00e9..a86ad2acecd2c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8196,7 +8196,7 @@ static SDValue getMemcpyLoadsAndStores( if (Value.getNode()) { Store = DAG.getStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo); OutChains.push_back(Store); } @@ -8221,14 +8221,14 @@ static SDValue getMemcpyLoadsAndStores( Value = DAG.getExtLoad( ISD::EXTLOAD, dl, NVT, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl), + DAG.getObjectPtrOffset(dl,
Src, TypeSize::getFixed(SrcOff)), SrcPtrInfo.getWithOffset(SrcOff), VT, commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo); OutLoadChains.push_back(Value.getValue(1)); Store = DAG.getTruncStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo); OutStoreChains.push_back(Store); } @@ -8365,7 +8365,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Value = DAG.getLoad( VT, dl, Chain, - DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl), + DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)), SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo); LoadValues.push_back(Value); LoadChains.push_back(Value.getValue(1)); @@ -8380,7 +8380,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, Store = DAG.getStore( Chain, dl, LoadValues[i], - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo); OutChains.push_back(Store); DstOff += VTSize; @@ -8512,7 +8512,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl, assert(Value.getValueType() == VT && "Value with wrong type."); SDValue Store = DAG.getStore( Chain, dl, Value, - DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl), + DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), DstPtrInfo.getWithOffset(DstOff), Alignment, isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone, NewAAInfo); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 536bf0c208752..62c009d06a4de 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1744,72 +1744,82 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr, isFlatScratchBaseLegal(Addr))) { int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue(); - const SIInstrInfo *TII = Subtarget->getInstrInfo(); - if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { - Addr = N0; - OffsetVal = COffsetVal; - } else { - // If the offset doesn't fit, put the low bits into the offset field and - // add the rest. - // - // For a FLAT instruction the hardware decides whether to access - // global/scratch/shared memory based on the high bits of vaddr, - // ignoring the offset field, so we have to ensure that when we add - // remainder to vaddr it still points into the same underlying object. - // The easiest way to do that is to make sure that we split the offset - // into two pieces that are both >= 0 or both <= 0.
- - SDLoc DL(N); - uint64_t RemainderOffset; - - std::tie(OffsetVal, RemainderOffset) = - TII->splitFlatOffset(COffsetVal, AS, FlatVariant); - - SDValue AddOffsetLo = - getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); - SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); - - if (Addr.getValueType().getSizeInBits() == 32) { - SmallVector<SDValue, 3> Opnds; - Opnds.push_back(N0); - Opnds.push_back(AddOffsetLo); - unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; - if (Subtarget->hasAddNoCarry()) { - AddOp = AMDGPU::V_ADD_U32_e64; - Opnds.push_back(Clamp); - } - Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + // Adding the offset to the base address in a FLAT instruction must not + // change the memory aperture in which the address falls. Therefore we can + // only fold offsets from inbounds GEPs into FLAT instructions. + bool IsInBounds = Addr->getFlags().hasInBounds(); + if (COffsetVal == 0 || FlatVariant != SIInstrFlags::FLAT || IsInBounds) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) { + Addr = N0; + OffsetVal = COffsetVal; } else { - // TODO: Should this try to use a scalar add pseudo if the base address - // is uniform and saddr is usable? - SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); - SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); - - SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub0); - SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - DL, MVT::i32, N0, Sub1); - - SDValue AddOffsetHi = - getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); - - SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); - - SDNode *Add = - CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, - {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); - - SDNode *Addc = CurDAG->getMachineNode( - AMDGPU::V_ADDC_U32_e64, DL, VTs, - {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); - - SDValue RegSequenceArgs[] = { - CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32), - SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; - - Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::i64, RegSequenceArgs), - 0); + // If the offset doesn't fit, put the low bits into the offset field + // and add the rest. + // + // For a FLAT instruction the hardware decides whether to access + // global/scratch/shared memory based on the high bits of vaddr, + // ignoring the offset field, so we have to ensure that when we add + // remainder to vaddr it still points into the same underlying object. + // The easiest way to do that is to make sure that we split the offset + // into two pieces that are both >= 0 or both <= 0. + + SDLoc DL(N); + uint64_t RemainderOffset; + + std::tie(OffsetVal, RemainderOffset) = + TII->splitFlatOffset(COffsetVal, AS, FlatVariant); + + SDValue AddOffsetLo = + getMaterializedScalarImm32(Lo_32(RemainderOffset), DL); + SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1); + + if (Addr.getValueType().getSizeInBits() == 32) { + SmallVector<SDValue, 3> Opnds; + Opnds.push_back(N0); + Opnds.push_back(AddOffsetLo); + unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32; + if (Subtarget->hasAddNoCarry()) { + AddOp = AMDGPU::V_ADD_U32_e64; + Opnds.push_back(Clamp); + } + Addr = + SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0); + } else { + // TODO: Should this try to use a scalar add pseudo if the base + // address is uniform and saddr is usable?
+ SDValue Sub0 = + CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32); + SDValue Sub1 = + CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32); + + SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub0); + SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + DL, MVT::i32, N0, Sub1); + + SDValue AddOffsetHi = + getMaterializedScalarImm32(Hi_32(RemainderOffset), DL); + + SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1); + + SDNode *Add = + CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs, + {AddOffsetLo, SDValue(N0Lo, 0), Clamp}); + + SDNode *Addc = CurDAG->getMachineNode( + AMDGPU::V_ADDC_U32_e64, DL, VTs, + {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp}); + + SDValue RegSequenceArgs[] = { + CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, + MVT::i32), + SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1}; + + Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::i64, RegSequenceArgs), + 0); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index e74fd21365c9d..90ef9a7a45863 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %addr, i32 -4 + %gep = getelementptr inbounds i32, ptr %addr, i32 -4 %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) ret void } @@ -49,7 +49,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %addr, i32 -4 + %gep = getelementptr inbounds i32, ptr %addr, i32 -4 %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) ret void } @@ -83,7 +83,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr ; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %addr, i32 4 + %gep = getelementptr inbounds i32, ptr %addr, i32 4 %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) store i32 %val, ptr %use ret void diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 3305cac0d7ea6..9b57bc2f74df0 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -12,8 +12,8 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; OPT-GFX7-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX7-NEXT: entry: -; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 -; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7 +; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 7 ; OPT-GFX7-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX7-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX7: if: @@ -28,8 +28,8 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; ; OPT-GFX8-LABEL: @test_sinkable_flat_small_offset_i32( ; 
OPT-GFX8-NEXT: entry: -; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 -; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7 +; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 7 ; OPT-GFX8-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX8-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX8: if: @@ -44,11 +44,11 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; ; OPT-GFX9-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX9-NEXT: entry: -; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX9-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX9-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX9: if: -; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 28 +; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 28 ; OPT-GFX9-NEXT: [[LOAD:%.*]] = load i32, ptr [[SUNKADDR]], align 4 ; OPT-GFX9-NEXT: br label [[ENDIF]] ; OPT-GFX9: endif: @@ -58,11 +58,11 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; ; OPT-GFX10-LABEL: @test_sinkable_flat_small_offset_i32( ; OPT-GFX10-NEXT: entry: -; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX10-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX10-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX10: if: -; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 28 +; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 28 ; OPT-GFX10-NEXT: [[LOAD:%.*]] = load i32, ptr [[SUNKADDR]], align 4 ; OPT-GFX10-NEXT: br label [[ENDIF]] ; OPT-GFX10: endif: @@ -146,8 +146,8 @@ define void @test_sinkable_flat_small_offset_i32(ptr %out, ptr %in, i32 %cond) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 999999 - %in.gep = getelementptr i32, ptr %in, i64 7 + %out.gep = getelementptr inbounds i32, ptr %out, i64 999999 + %in.gep = getelementptr inbounds i32, ptr %in, i64 7 %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %endif, label %if @@ -167,12 +167,12 @@ done: define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, i32 %cond) { ; OPT-GFX7-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX7-NEXT: entry: -; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX7-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX7-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX7: if: ; OPT-GFX7-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1) -; OPT-GFX7-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28 +; OPT-GFX7-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 28 ; OPT-GFX7-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4 ; OPT-GFX7-NEXT: br label [[ENDIF]] ; OPT-GFX7: endif: @@ -182,8 +182,8 @@ define void 
@test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; ; OPT-GFX8-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX8-NEXT: entry: -; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 -; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr [[IN:%.*]], i64 7 +; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 7 ; OPT-GFX8-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX8-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX8: if: @@ -197,12 +197,12 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; ; OPT-GFX9-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX9-NEXT: entry: -; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX9-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX9-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX9: if: ; OPT-GFX9-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1) -; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28 +; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 28 ; OPT-GFX9-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4 ; OPT-GFX9-NEXT: br label [[ENDIF]] ; OPT-GFX9: endif: @@ -212,12 +212,12 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; ; OPT-GFX10-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32( ; OPT-GFX10-NEXT: entry: -; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-GFX10-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-GFX10-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX10: if: ; OPT-GFX10-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[IN:%.*]] to ptr addrspace(1) -; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP0]], i64 28 +; OPT-GFX10-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 28 ; OPT-GFX10-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) [[SUNKADDR]], align 4 ; OPT-GFX10-NEXT: br label [[ENDIF]] ; OPT-GFX10: endif: @@ -303,8 +303,8 @@ define void @test_sink_noop_addrspacecast_flat_to_global_i32(ptr %out, ptr %in, ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 999999 - %in.gep = getelementptr i32, ptr %in, i64 7 + %out.gep = getelementptr inbounds i32, ptr %out, i64 999999 + %in.gep = getelementptr inbounds i32, ptr %in, i64 7 %cast = addrspacecast ptr %in.gep to ptr addrspace(1) %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %endif, label %if @@ -325,12 +325,12 @@ done: define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in, i32 %cond) { ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 999999 +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 999999 ; OPT-NEXT: [[CMP0:%.*]] = icmp eq i32 [[COND:%.*]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT: if: ; OPT-NEXT: [[TMP0:%.*]] = 
addrspacecast ptr [[IN:%.*]] to ptr addrspace(4) -; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i64 28 +; OPT-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP0]], i64 28 ; OPT-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(4) [[SUNKADDR]], align 4 ; OPT-NEXT: br label [[ENDIF]] ; OPT: endif: @@ -416,8 +416,8 @@ define void @test_sink_noop_addrspacecast_flat_to_constant_i32(ptr %out, ptr %in ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 999999 - %in.gep = getelementptr i32, ptr %in, i64 7 + %out.gep = getelementptr inbounds i32, ptr %out, i64 999999 + %in.gep = getelementptr inbounds i32, ptr %in, i64 7 %cast = addrspacecast ptr %in.gep to ptr addrspace(4) %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %endif, label %if @@ -438,8 +438,8 @@ done: define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; OPT-GFX7-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX7-NEXT: entry: -; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX7-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX7-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX7-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX7-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX7-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -456,8 +456,8 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; ; OPT-GFX8-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX8-NEXT: entry: -; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX8-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX8-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX8-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX8-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX8-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -474,12 +474,12 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; ; OPT-GFX9-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX9-NEXT: entry: -; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX9-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 ; OPT-GFX9-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX9-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX9-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] ; OPT-GFX9: if: -; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX9-NEXT: [[SUNKADDR:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX9-NEXT: [[LOAD:%.*]] = load i8, ptr [[SUNKADDR]], align 1 ; OPT-GFX9-NEXT: [[CAST:%.*]] = sext i8 [[LOAD]] to i32 ; OPT-GFX9-NEXT: br label [[ENDIF]] @@ -490,8 +490,8 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; ; OPT-GFX10-LABEL: @test_sink_flat_small_max_flat_offset( ; OPT-GFX10-NEXT: entry: -; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-GFX10-NEXT: [[IN_GEP:%.*]] = 
getelementptr i8, ptr [[IN:%.*]], i64 4095 +; OPT-GFX10-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-GFX10-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4095 ; OPT-GFX10-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-GFX10-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-GFX10-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -588,8 +588,8 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i32 1024 - %in.gep = getelementptr i8, ptr %in, i64 4095 + %out.gep = getelementptr inbounds i32, ptr %out, i32 1024 + %in.gep = getelementptr inbounds i8, ptr %in, i64 4095 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp0 = icmp eq i32 %tid, 0 br i1 %cmp0, label %endif, label %if @@ -611,8 +611,8 @@ done: define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; OPT-LABEL: @test_sink_flat_small_max_plus_1_flat_offset( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i64 99999 -; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 4096 +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i64 99999 +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 4096 ; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]] ; OPT-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -711,8 +711,8 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i64 99999 - %in.gep = getelementptr i8, ptr %in, i64 4096 + %out.gep = getelementptr inbounds i32, ptr %out, i64 99999 + %in.gep = getelementptr inbounds i8, ptr %in, i64 4096 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp0 = icmp eq i32 %tid, 0 br i1 %cmp0, label %endif, label %if @@ -734,8 +734,8 @@ done: define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; OPT-LABEL: @test_sinkable_flat_reg_offset( ; OPT-NEXT: entry: -; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr i32, ptr [[OUT:%.*]], i32 1024 -; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i8, ptr [[IN:%.*]], i64 [[REG:%.*]] +; OPT-NEXT: [[OUT_GEP:%.*]] = getelementptr inbounds i32, ptr [[OUT:%.*]], i32 1024 +; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr inbounds i8, ptr [[IN:%.*]], i64 [[REG:%.*]] ; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3]] ; OPT-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TID]], 0 ; OPT-NEXT: br i1 [[CMP0]], label [[ENDIF:%.*]], label [[IF:%.*]] @@ -834,8 +834,8 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: - %out.gep = getelementptr i32, ptr %out, i32 1024 - %in.gep = getelementptr i8, ptr %in, i64 %reg + %out.gep = getelementptr inbounds i32, ptr %out, i32 1024 + %in.gep = getelementptr inbounds i8, ptr %in, i64 %reg %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %cmp0 = icmp eq i32 %tid, 0 br i1 %cmp0, label %endif, label %if diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index c713c48c92457..4a6b1843de3b6 100644 --- 
a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -369,7 +369,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -563,7 +563,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -986,7 +986,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -1208,7 +1208,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -1397,7 +1397,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -1617,7 +1617,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -1781,7 +1781,7 @@ define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst ret void } @@ -1932,7 +1932,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 
s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2149,7 +2149,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -2313,7 +2313,7 @@ define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0 ret void } @@ -2679,7 +2679,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2873,7 +2873,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -3296,7 +3296,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3518,7 +3518,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -3707,7 +3707,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -3927,7 +3927,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -4116,7 +4116,7 @@ define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memor ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret float %result } @@ -4336,7 +4336,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret void } @@ -10060,7 +10060,7 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -10282,7 +10282,7 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -12900,7 +12900,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -13177,7 +13177,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -14592,7 +14592,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -14796,7 +14796,7 @@ define <2 x half> 
@flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -15161,7 +15161,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15362,7 +15362,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -15555,7 +15555,7 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_f ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -15744,7 +15744,7 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -17033,7 +17033,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -17325,7 +17325,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -17866,7 +17866,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = 
atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -18155,7 +18155,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -18436,7 +18436,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_n ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } @@ -18713,7 +18713,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511 %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 71abe6f32e81e..740fd47a72061 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -297,7 +297,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_grai ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -460,7 +460,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -747,7 +747,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -910,7 +910,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1061,7 +1061,7 @@ define float 
@flat_system_atomic_fmax_ret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -1211,7 +1211,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -1839,7 +1839,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2002,7 +2002,7 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2289,7 +2289,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2452,7 +2452,7 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 -512 + %gep = getelementptr inbounds float, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -2603,7 +2603,7 @@ define float @flat_system_atomic_fmax_ret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret float %result } @@ -2753,7 +2753,7 @@ define void @flat_system_atomic_fmax_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr float, ptr %ptr, i64 511 + %gep = getelementptr inbounds float, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -8054,7 +8054,7 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret half %result } @@ -8284,7 +8284,7 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr half, ptr %ptr, i64 1023 + %gep = getelementptr inbounds half, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -11270,7 +11270,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret bfloat %result } @@ -11548,7 +11548,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr bfloat, ptr %ptr, i64 1023 + %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023 %unused = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0 ret void } @@ -12730,7 +12730,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -12986,7 +12986,7 @@ define <2 x half> @flat_agent_atomic_fmax_ret_v2f16__offset12b_neg__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } @@ -13445,7 +13445,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 511 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511 %unused = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13696,7 +13696,7 @@ define void @flat_agent_atomic_fmax_noret_v2f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 + %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512 %unused = atomicrmw fmax ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret void } @@ -13937,7 
+13937,7 @@ define <2 x half> @flat_system_atomic_fmax_ret_v2f16__offset12b_pos__amdgpu_no_f
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
@@ -14173,7 +14173,7 @@ define void @flat_system_atomic_fmax_noret_v2f16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -14872,7 +14872,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_no
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
@@ -15239,7 +15239,7 @@ define <2 x bfloat> @flat_agent_atomic_fmax_ret_v2bf16__offset12b_neg__amdgpu_no
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512
   %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
@@ -15916,7 +15916,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -16276,7 +16276,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512
   %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -16628,7 +16628,7 @@ define <2 x bfloat> @flat_system_atomic_fmax_ret_v2bf16__offset12b_pos__amdgpu_n
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %result = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
@@ -16973,7 +16973,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %unused = atomicrmw fmax ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index 49c4b9000d8b5..77cdb720c21ae 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -297,7 +297,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_grai
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
@@ -460,7 +460,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__amdgpu_no_fine_grai
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
@@ -747,7 +747,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -910,7 +910,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -1061,7 +1061,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__amdgpu_no_fine_gra
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
@@ -1211,7 +1211,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__amdgpu_no_fine_gr
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -1839,7 +1839,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
@@ -2002,7 +2002,7 @@ define float @flat_agent_atomic_fmin_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %result = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
@@ -2289,7 +2289,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -2452,7 +2452,7 @@ define void @flat_agent_atomic_fmin_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %unused = atomicrmw fmin ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -2603,7 +2603,7 @@ define float @flat_system_atomic_fmin_ret_f32__offset12b_pos__ftz__amdgpu_no_fin
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret float %result
 }
@@ -2753,7 +2753,7 @@ define void @flat_system_atomic_fmin_noret_f32__offset12b_pos__ftz__amdgpu_no_fi
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    buffer_wbinvl1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -8054,7 +8054,7 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr half, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds half, ptr %ptr, i64 1023
   %result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret half %result
 }
@@ -8284,7 +8284,7 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr half, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds half, ptr %ptr, i64 1023
   %unused = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -11270,7 +11270,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr bfloat, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
   %result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret bfloat %result
 }
@@ -11548,7 +11548,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr bfloat, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
   %unused = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -12730,7 +12730,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_fi
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
@@ -12986,7 +12986,7 @@ define <2 x half> @flat_agent_atomic_fmin_ret_v2f16__offset12b_neg__amdgpu_no_fi
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512
   %result = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
@@ -13445,7 +13445,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_g
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -13696,7 +13696,7 @@ define void @flat_agent_atomic_fmin_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512
   %unused = atomicrmw fmin ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -13937,7 +13937,7 @@ define <2 x half> @flat_system_atomic_fmin_ret_v2f16__offset12b_pos__amdgpu_no_f
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x half> %result
 }
@@ -14173,7 +14173,7 @@ define void @flat_system_atomic_fmin_noret_v2f16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -14872,7 +14872,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_no
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
@@ -15239,7 +15239,7 @@ define <2 x bfloat> @flat_agent_atomic_fmin_ret_v2bf16__offset12b_neg__amdgpu_no
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512
   %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
@@ -15916,7 +15916,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -16276,7 +16276,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512
   %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
@@ -16628,7 +16628,7 @@ define <2 x bfloat> @flat_system_atomic_fmin_ret_v2bf16__offset12b_pos__amdgpu_n
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %result = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret <2 x bfloat> %result
 }
@@ -16973,7 +16973,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %unused = atomicrmw fmin ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index a6f8880d6d6f8..c0c5ff0c96c75 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -401,7 +401,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %val
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret float %result
 }
@@ -618,7 +618,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg(ptr %ptr, float %val
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret float %result
 }
@@ -995,7 +995,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %va
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret void
 }
@@ -1205,7 +1205,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg(ptr %ptr, float %va
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret void
 }
@@ -1407,7 +1407,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos(ptr %ptr, float %va
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, float %val seq_cst
   ret float %result
 }
@@ -1602,7 +1602,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos(ptr %ptr, float %v
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, float %val seq_cst
   ret void
 }
@@ -2000,7 +2000,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, float
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret float %result
 }
@@ -2217,7 +2217,7 @@ define float @flat_agent_atomic_fsub_ret_f32__offset12b_neg__ftz(ptr %ptr, float
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %result = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret float %result
 }
@@ -2594,7 +2594,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret void
 }
@@ -2804,7 +2804,7 @@ define void @flat_agent_atomic_fsub_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds float, ptr %ptr, i64 -512
   %unused = atomicrmw fsub ptr %gep, float %val syncscope("agent") seq_cst
   ret void
 }
@@ -3006,7 +3006,7 @@ define float @flat_system_atomic_fsub_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, float %val seq_cst
   ret float %result
 }
@@ -3201,7 +3201,7 @@ define void @flat_system_atomic_fsub_noret_f32__offset12b_pos__ftz(ptr %ptr, flo
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %ptr, i64 511
+  %gep = getelementptr inbounds float, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, float %val seq_cst
   ret void
 }
@@ -7547,7 +7547,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr half, ptr %ptr, i64 -1024
+  %gep = getelementptr inbounds half, ptr %ptr, i64 -1024
   %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
   ret void
 }
@@ -7769,7 +7769,7 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr half, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds half, ptr %ptr, i64 1023
   %result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4
   ret half %result
 }
@@ -7983,7 +7983,7 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr half, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds half, ptr %ptr, i64 1023
   %unused = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4
   ret void
 }
@@ -10932,7 +10932,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr bfloat, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
   %result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
   ret bfloat %result
 }
@@ -11209,7 +11209,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr bfloat, ptr %ptr, i64 1023
+  %gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
   %unused = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
   ret void
 }
@@ -12355,7 +12355,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
   ret <2 x half> %result
 }
@@ -12594,7 +12594,7 @@ define <2 x half> @flat_agent_atomic_fsub_ret_v2f16__offset12b_neg(ptr %ptr, <2
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512
   %result = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
   ret <2 x half> %result
 }
@@ -13015,7 +13015,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
   ret void
 }
@@ -13247,7 +13247,7 @@ define void @flat_agent_atomic_fsub_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 -512
   %unused = atomicrmw fsub ptr %gep, <2 x half> %val syncscope("agent") seq_cst
   ret void
 }
@@ -13471,7 +13471,7 @@ define <2 x half> @flat_system_atomic_fsub_ret_v2f16__offset12b_pos(ptr %ptr, <2
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst
   ret <2 x half> %result
 }
@@ -13688,7 +13688,7 @@ define void @flat_system_atomic_fsub_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x half>, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, <2 x half> %val seq_cst
   ret void
 }
@@ -14387,7 +14387,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
   ret <2 x bfloat> %result
 }
@@ -14754,7 +14754,7 @@ define <2 x bfloat> @flat_agent_atomic_fsub_ret_v2bf16__offset12b_neg(ptr %ptr,
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512
   %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
   ret <2 x bfloat> %result
 }
@@ -15431,7 +15431,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
   ret void
 }
@@ -15791,7 +15791,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 -512
   %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
   ret void
 }
@@ -16143,7 +16143,7 @@ define <2 x bfloat> @flat_system_atomic_fsub_ret_v2bf16__offset12b_pos(ptr %ptr,
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %result = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst
   ret <2 x bfloat> %result
 }
@@ -16488,7 +16488,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
 ; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+  %gep = getelementptr inbounds <2 x bfloat>, ptr %ptr, i64 511
   %unused = atomicrmw fsub ptr %gep, <2 x bfloat> %val seq_cst
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index e674b57aae3ef..7d29d8d395d1d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -47,7 +47,7 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -96,7 +96,7 @@ define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 1023
+  %gep = getelementptr inbounds i32, ptr %out, i32 1023
   %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -147,7 +147,7 @@ define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 1024
+  %gep = getelementptr inbounds i32, ptr %out, i32 1024
   %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -205,7 +205,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -267,8 +267,8 @@ define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -338,8 +338,8 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -495,7 +495,7 @@ define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -561,7 +561,7 @@ define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -611,7 +611,7 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -669,7 +669,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -731,8 +731,8 @@ define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -802,8 +802,8 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -959,7 +959,7 @@ define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -1025,7 +1025,7 @@ define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1075,7 +1075,7 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -1133,7 +1133,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1195,8 +1195,8 @@ define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -1266,8 +1266,8 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1423,7 +1423,7 @@ define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -1489,7 +1489,7 @@ define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1536,7 +1536,7 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -1594,7 +1594,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1653,8 +1653,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -1724,8 +1724,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1875,7 +1875,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -1941,7 +1941,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -1988,7 +1988,7 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2046,7 +2046,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -2105,8 +2105,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2176,8 +2176,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -2327,7 +2327,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2393,7 +2393,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -2440,7 +2440,7 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2498,7 +2498,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -2557,8 +2557,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2628,8 +2628,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -2779,7 +2779,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2845,7 +2845,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -2892,7 +2892,7 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -2950,7 +2950,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3009,8 +3009,8 @@ define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -3080,8 +3080,8 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3231,7 +3231,7 @@ define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   ret void
 }
@@ -3297,7 +3297,7 @@ define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3347,7 +3347,7 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -3405,7 +3405,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3467,8 +3467,8 @@ define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %i
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -3538,8 +3538,8 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3695,7 +3695,7 @@ define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -3761,7 +3761,7 @@ define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3811,7 +3811,7 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -3860,7 +3860,7 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, ptr %out, i32 4
+  %gep = getelementptr inbounds float, ptr %out, i32 4
   %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
   ret void
 }
@@ -3918,7 +3918,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -3980,8 +3980,8 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -4051,8 +4051,8 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -4208,7 +4208,7 @@ define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -4274,7 +4274,7 @@ define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -4326,7 +4326,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
@@ -4387,7 +4387,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
@@ -4456,8 +4456,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
@@ -4533,8 +4533,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %o
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
@@ -4701,7 +4701,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %ind
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   ret void
 }
@@ -4773,7 +4773,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i3
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
   store i32 %flag, ptr %out2
@@ -4824,7 +4824,7 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -4882,7 +4882,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -4944,8 +4944,8 @@ define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -5015,8 +5015,8 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2,
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -5172,7 +5172,7 @@ define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index)
 ; GCN3-NEXT:    buffer_wbinvl1_vol
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
   ret void
 }
@@ -5238,7 +5238,7 @@ define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
   store i32 %val, ptr %out2
   ret void
@@ -5291,7 +5291,7 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %in, i32 4
+  %gep = getelementptr inbounds i32, ptr %in, i32 4
   %val = load atomic i32, ptr %gep seq_cst, align 4
   store i32 %val, ptr %out
   ret void
@@ -5404,8 +5404,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %in, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %in, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   %val = load atomic i32, ptr %gep seq_cst, align 4
   store i32 %val, ptr %out
   ret void
@@ -5466,7 +5466,7 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index)
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %in, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %in, i64 %index
   %val = load atomic i32, ptr %ptr seq_cst, align 4
   store i32 %val, ptr %out
   ret void
@@ -5510,7 +5510,7 @@ define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
   store atomic i32 %in, ptr %gep seq_cst, align 4
   ret void
 }
@@ -5600,8 +5600,8 @@ define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
   store atomic i32 %in, ptr %gep seq_cst, align 4
   ret void
 }
@@ -5649,7 +5649,7 @@ define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
   store atomic i32 %in, ptr %ptr seq_cst, align 4
   ret void
 }
@@ -5701,7 +5701,7 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, ptr %in, i32 4
+  %gep = getelementptr inbounds float, ptr %in, i32 4
   %val = load atomic float, ptr %gep seq_cst, align 4
   store float %val, ptr %out
   ret void
@@ -5814,8 +5814,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, ptr %in, i64 %index
-  %gep = getelementptr float, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds float, ptr %in, i64 %index
+  %gep = getelementptr inbounds float, ptr %ptr, i32 4
   %val = load atomic float, ptr %gep seq_cst, align 4
   store float %val, ptr %out
   ret void
@@ -5876,7 +5876,7 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index)
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, ptr %in, i64 %index
+  %ptr = getelementptr inbounds float, ptr %in, i64 %index
   %val = load atomic float, ptr %ptr seq_cst, align 4
   store float %val, ptr %out
   ret void
@@ -5920,7 +5920,7 @@ define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr float, ptr %out, i32 4
+  %gep = getelementptr inbounds float, ptr %out, i32 4
   store atomic float %in, ptr %gep seq_cst, align 4
   ret void
 }
@@ -6010,8 +6010,8 @@ define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, ptr %out, i64 %index
-  %gep = getelementptr float, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds float, ptr %out, i64 %index
+  %gep = getelementptr inbounds float, ptr %ptr, i32 4
   store atomic float %in, ptr %gep seq_cst, align 4
   ret void
 }
@@ -6059,7 +6059,7 @@ define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %ind
 ; GCN3-NEXT:    flat_store_dword v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr float, ptr %out, i64 %index
+  %ptr = getelementptr inbounds float, ptr %out, i64 %index
   store atomic float %in, ptr %ptr seq_cst, align 4
   ret void
 }
@@ -6111,7 +6111,7 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, ptr %in, i64 16
+  %gep = getelementptr inbounds i8, ptr %in, i64 16
   %val = load atomic i8, ptr %gep seq_cst, align 1
   store i8 %val, ptr %out
   ret void
@@ -6221,8 +6221,8 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i8, ptr %in, i64 %index
-  %gep = getelementptr i8, ptr %ptr, i64 16
+  %ptr = getelementptr inbounds i8, ptr %in, i64 %index
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 16
   %val = load atomic i8, ptr %gep seq_cst, align 1
   store i8 %val, ptr %out
   ret void
@@ -6266,7 +6266,7 @@ define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i8, ptr %out, i64 16
+  %gep = getelementptr inbounds i8, ptr %out, i64 16
   store atomic i8 %in, ptr %gep seq_cst, align 1
   ret void
 }
@@ -6353,8 +6353,8 @@ define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %
 ; GCN3-NEXT:    flat_store_byte v[0:1], v2 offset:16
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i8, ptr %out, i64 %index
-  %gep = getelementptr i8, ptr %ptr, i64 16
+  %ptr = getelementptr inbounds i8, ptr %out, i64 %index
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 16
   store atomic i8 %in, ptr %gep seq_cst, align 1
   ret void
 }
@@ -6406,7 +6406,7 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr i16, ptr %in, i64 8
+  %gep = getelementptr inbounds i16, ptr %in, i64 8
   %val = load atomic i16, ptr %gep seq_cst, align 2
   store i16 %val, ptr %out
   ret void
@@ -6519,8 +6519,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
 ; GCN3-NEXT:    flat_store_short v[0:1], v2
 ; GCN3-NEXT:    s_endpgm
 entry:
-  %ptr = getelementptr i16, ptr %in, i64 %index
-  %gep = getelementptr i16, ptr %ptr, i64 8
+  %ptr = getelementptr inbounds i16, ptr %in, i64 %index
+  %gep = getelementptr inbounds i16, ptr %ptr, i64 8
   %val = load atomic i16, ptr %gep seq_cst, align 2
   store i16 %val, ptr %out
   ret void
@@ -6564,7 +6564,7 @@ define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
 ; GCN3-NEXT:    flat_store_short v[0:1], v2 offset:16
offset:16 ; GCN3-NEXT: s_endpgm entry: - %gep = getelementptr i16, ptr %out, i64 8 + %gep = getelementptr inbounds i16, ptr %out, i64 8 store atomic i16 %in, ptr %gep seq_cst, align 2 ret void } @@ -6654,8 +6654,8 @@ define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i16, ptr %out, i64 %index - %gep = getelementptr i16, ptr %ptr, i64 8 + %ptr = getelementptr inbounds i16, ptr %out, i64 %index + %gep = getelementptr inbounds i16, ptr %ptr, i64 8 store atomic i16 %in, ptr %gep seq_cst, align 2 ret void } @@ -6698,7 +6698,7 @@ define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) { ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16 ; GCN3-NEXT: s_endpgm entry: - %gep = getelementptr half, ptr %out, i64 8 + %gep = getelementptr inbounds half, ptr %out, i64 8 store atomic half %in, ptr %gep seq_cst, align 2 ret void } @@ -6774,7 +6774,7 @@ define amdgpu_kernel void @atomic_store_bf16_offset(bfloat %in, ptr %out) { ; GCN3-NEXT: v_mov_b32_e32 v2, s2 ; GCN3-NEXT: flat_store_short v[0:1], v2 ; GCN3-NEXT: s_endpgm - %gep = getelementptr bfloat, ptr %out, i64 8 + %gep = getelementptr inbounds bfloat, ptr %out, i64 8 store atomic bfloat %in, ptr %out seq_cst, align 2 ret void } @@ -6860,7 +6860,7 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -6909,7 +6909,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1023 + %gep = getelementptr inbounds i32, ptr %out, i32 1023 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -6960,7 +6960,7 @@ define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 1024 + %gep = getelementptr inbounds i32, ptr %out, i32 1024 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -7018,7 +7018,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %i ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst store i32 %val, ptr %out2 ret void @@ -7080,8 +7080,8 @@ define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 % ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst ret void } @@ -7151,8 +7151,8 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, ; GCN3-NEXT: flat_store_dword v[0:1], v2 ; GCN3-NEXT: s_endpgm entry: - %ptr = getelementptr i32, ptr %out, i64 %index - %gep = getelementptr i32, ptr %ptr, i32 4 + %ptr = getelementptr inbounds i32, ptr %out, i64 %index 
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
@@ -7308,7 +7308,7 @@ define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}
@@ -7374,7 +7374,7 @@ define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
  %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
@@ -7424,7 +7424,7 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
@@ -7473,7 +7473,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
-  %gep = getelementptr i32, ptr %out, i32 1023
+  %gep = getelementptr inbounds i32, ptr %out, i32 1023
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
@@ -7524,7 +7524,7 @@ define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
-  %gep = getelementptr i32, ptr %out, i32 1024
+  %gep = getelementptr inbounds i32, ptr %out, i32 1024
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
@@ -7582,7 +7582,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
@@ -7644,8 +7644,8 @@ define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  ret void
}
@@ -7715,8 +7715,8 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
@@ -7872,7 +7872,7 @@ define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  ret void
}
@@ -7938,7 +7938,7 @@ define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i64 %index
  %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
  store i32 %val, ptr %out2
  ret void
@@ -7990,7 +7990,7 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
-  %gep = getelementptr half, ptr %in, i64 8
+  %gep = getelementptr inbounds half, ptr %in, i64 8
  %val = load atomic half, ptr %gep seq_cst, align 2
  store half %val, ptr %out
  ret void
@@ -8089,7 +8089,7 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
; GCN3-NEXT: v_mov_b32_e32 v1, s3
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
-  %gep = getelementptr bfloat, ptr %in, i64 8
+  %gep = getelementptr inbounds bfloat, ptr %in, i64 8
  %val = load atomic bfloat, ptr %gep seq_cst, align 2
  store bfloat %val, ptr %out
  ret void
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 1311560715ddd..0e84c7295d29b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -63,7 +63,7 @@ define void @flat_atomic_xchg_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
  ret void
}
@@ -124,7 +124,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -203,7 +203,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst
  ret void
}
@@ -282,7 +282,7 @@ define amdgpu_gfx i32 @flat_atomic_xchg_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -315,7 +315,7 @@ define void @flat_atomic_xchg_i32_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -348,7 +348,7 @@ define i32 @flat_atomic_xchg_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw xchg ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -413,7 +413,7 @@ define void @flat_atomic_xchg_f32_noret_offset(ptr %out, float %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %out, i32 4
+  %gep = getelementptr inbounds float, ptr %out, i32 4
  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
  ret void
}
@@ -474,7 +474,7 @@ define float @flat_atomic_xchg_f32_ret_offset(ptr %out, float %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %out, i32 4
+  %gep = getelementptr inbounds float, ptr %out, i32 4
  %result = atomicrmw xchg ptr %gep, float %in seq_cst
  ret float %result
}
@@ -553,7 +553,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %out, i32 4
+  %gep = getelementptr inbounds float, ptr %out, i32 4
  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst
  ret void
}
@@ -632,7 +632,7 @@ define amdgpu_gfx float @flat_atomic_xchg_f32_ret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %out, i32 4
+  %gep = getelementptr inbounds float, ptr %out, i32 4
  %result = atomicrmw xchg ptr %gep, float %in seq_cst
  ret float %result
}
@@ -665,7 +665,7 @@ define void @flat_atomic_xchg_f32_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %out, i64 4
+  %gep = getelementptr inbounds float, ptr %out, i64 4
  %tmp0 = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -698,7 +698,7 @@ define float @flat_atomic_xchg_f32_ret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr float, ptr %out, i64 4
+  %gep = getelementptr inbounds float, ptr %out, i64 4
  %result = atomicrmw xchg ptr %gep, float %in seq_cst, !amdgpu.no.remote.memory !0
  ret float %result
}
@@ -763,7 +763,7 @@ define void @flat_atomic_add_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
  ret void
}
@@ -824,7 +824,7 @@ define i32 @flat_atomic_add_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw add ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -903,7 +903,7 @@ define amdgpu_gfx void @flat_atomic_add_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst
  ret void
}
@@ -982,7 +982,7 @@ define amdgpu_gfx i32 @flat_atomic_add_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw add ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -1015,7 +1015,7 @@ define void @flat_atomic_add_i32_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -1048,7 +1048,7 @@ define i32 @flat_atomic_add_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw add ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -1113,7 +1113,7 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
  ret void
}
@@ -1174,7 +1174,7 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw sub ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -1253,7 +1253,7 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst
  ret void
}
@@ -1332,7 +1332,7 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw sub ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -1365,7 +1365,7 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -1398,7 +1398,7 @@ define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -1463,7 +1463,7 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
  ret void
}
@@ -1524,7 +1524,7 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw and ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -1603,7 +1603,7 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst
  ret void
}
@@ -1682,7 +1682,7 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw and ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -1715,7 +1715,7 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -1748,7 +1748,7 @@ define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -1897,7 +1897,7 @@ define void @flat_atomic_nand_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst
  ret void
}
@@ -2046,7 +2046,7 @@ define i32 @flat_atomic_nand_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw nand ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -2203,7 +2203,7 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst
  ret void
}
@@ -2368,7 +2368,7 @@ define amdgpu_gfx i32 @flat_atomic_nand_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw nand ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -2443,7 +2443,7 @@ define void @flat_atomic_nand_i32_noret_offset__amdgpu_no_remote_memory(ptr %out
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -2519,7 +2519,7 @@ define i32 @flat_atomic_nand_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw nand ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -2584,7 +2584,7 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
  ret void
}
@@ -2645,7 +2645,7 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw or ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -2724,7 +2724,7 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst
  ret void
}
@@ -2803,7 +2803,7 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw or ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -2836,7 +2836,7 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -2869,7 +2869,7 @@ define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -2934,7 +2934,7 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
  ret void
}
@@ -2995,7 +2995,7 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw xor ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -3074,7 +3074,7 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst
  ret void
}
@@ -3153,7 +3153,7 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw xor ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -3186,7 +3186,7 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -3219,7 +3219,7 @@ define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -3362,7 +3362,7 @@ define void @flat_atomic_max_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
  ret void
}
@@ -3505,7 +3505,7 @@ define i32 @flat_atomic_max_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw max ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -3656,7 +3656,7 @@ define amdgpu_gfx void @flat_atomic_max_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
  ret void
}
@@ -3815,7 +3815,7 @@ define amdgpu_gfx i32 @flat_atomic_max_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw max ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -3909,8 +3909,8 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
  ret void
}
@@ -4019,8 +4019,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst
  store i32 %tmp0, ptr %out2
  ret void
@@ -4111,7 +4111,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index)
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
  %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
  ret void
}
@@ -4216,7 +4216,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
  %tmp0 = atomicrmw max ptr %ptr, i32 %in seq_cst
  store i32 %tmp0, ptr %out2
  ret void
@@ -4289,7 +4289,7 @@ define void @flat_max_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -4362,7 +4362,7 @@ define i32 @flat_atomic_max_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw max ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -4505,7 +4505,7 @@ define void @flat_atomic_umax_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
  ret void
}
@@ -4648,7 +4648,7 @@ define i32 @flat_atomic_umax_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw umax ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -4799,7 +4799,7 @@ define amdgpu_gfx void @flat_atomic_umax_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
  ret void
}
@@ -4958,7 +4958,7 @@ define amdgpu_gfx i32 @flat_atomic_umax_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw umax ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -5052,8 +5052,8 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
  ret void
}
@@ -5162,8 +5162,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst
  store i32 %tmp0, ptr %out2
  ret void
@@ -5269,7 +5269,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
  %tmp0 = atomicrmw umax ptr %ptr, i32 %in seq_cst
  store i32 %tmp0, ptr %out2
  ret void
@@ -5342,7 +5342,7 @@ define void @flat_umax_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -5415,7 +5415,7 @@ define i32 @flat_atomic_umax_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw umax ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -5558,7 +5558,7 @@ define void @flat_atomic_umin_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst
  ret void
}
@@ -5701,7 +5701,7 @@ define i32 @flat_atomic_umin_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw umin ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -5852,7 +5852,7 @@ define amdgpu_gfx void @flat_atomic_umin_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst
  ret void
}
@@ -6011,7 +6011,7 @@ define amdgpu_gfx i32 @flat_atomic_umin_i32_ret_offset_scalar(ptr inreg %out, i3
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw umin ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -6083,7 +6083,7 @@ define void @flat_umin_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -6156,7 +6156,7 @@ define i32 @flat_atomic_umin_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw umin ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -6299,7 +6299,7 @@ define void @flat_atomic_min_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
  ret void
}
@@ -6442,7 +6442,7 @@ define i32 @flat_atomic_min_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw min ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -6593,7 +6593,7 @@ define amdgpu_gfx void @flat_atomic_min_i32_noret_offset_scalar(ptr inreg %out,
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
  ret void
}
@@ -6752,7 +6752,7 @@ define amdgpu_gfx i32 @flat_atomic_min_i32_ret_offset_scalar(ptr inreg %out, i32
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw min ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -6846,8 +6846,8 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 %
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
  ret void
}
@@ -6956,8 +6956,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
-  %gep = getelementptr i32, ptr %ptr, i32 4
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
+  %gep = getelementptr inbounds i32, ptr %ptr, i32 4
  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst
  store i32 %tmp0, ptr %out2
  ret void
@@ -7140,7 +7140,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i32, ptr %out, i32 %index
+  %ptr = getelementptr inbounds i32, ptr %out, i32 %index
  %tmp0 = atomicrmw min ptr %ptr, i32 %in seq_cst
  store i32 %tmp0, ptr %out2
  ret void
@@ -7213,7 +7213,7 @@ define void @flat_min_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -7286,7 +7286,7 @@ define i32 @flat_atomic_min_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3
; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
; GCN3-NEXT: v_mov_b32_e32 v0, v3
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw min ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -7351,7 +7351,7 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret void
}
@@ -7412,7 +7412,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -7491,7 +7491,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret void
}
@@ -7570,7 +7570,7 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -7603,7 +7603,7 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -7636,7 +7636,7 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %o
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
@@ -7701,7 +7701,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret void
}
@@ -7762,7 +7762,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) {
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -7841,7 +7841,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret void
}
@@ -7920,7 +7920,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i32 4
+  %gep = getelementptr inbounds i32, ptr %out, i32 4
  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst
  ret i32 %result
}
@@ -7953,7 +7953,7 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out,
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret void
}
@@ -7986,7 +7986,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %o
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr %out, i64 4
+  %gep = getelementptr inbounds i32, ptr %out, i64 4
  %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
  ret i32 %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 07c9521e7646a..6ffb18dfd4d54 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -11794,7 +11794,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %in, i64 4
+  %gep = getelementptr inbounds i64, ptr %in, i64 4
  %val = load atomic i64, ptr %gep seq_cst, align 8
  store i64 %val, ptr %out
  ret void
@@ -11904,8 +11904,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %in, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %in, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %val = load atomic i64, ptr %gep seq_cst, align 8
  store i64 %val, ptr %out
  ret void
@@ -11965,7 +11965,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %in, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %in, i64 %index
  %val = load atomic i64, ptr %ptr seq_cst, align 8
  store i64 %val, ptr %out
  ret void
@@ -12008,7 +12008,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  store atomic i64 %in, ptr %gep seq_cst, align 8
  ret void
}
@@ -12100,8 +12100,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  store atomic i64 %in, ptr %gep seq_cst, align 8
  ret void
}
@@ -13577,7 +13577,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr double, ptr %in, i64 4
+  %gep = getelementptr inbounds double, ptr %in, i64 4
  %val = load atomic double, ptr %gep seq_cst, align 8
  store double %val, ptr %out
  ret void
@@ -13687,8 +13687,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr double, ptr %in, i64 %index
-  %gep = getelementptr double, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds double, ptr %in, i64 %index
+  %gep = getelementptr inbounds double, ptr %ptr, i64 4
  %val = load atomic double, ptr %gep seq_cst, align 8
  store double %val, ptr %out
  ret void
@@ -13791,7 +13791,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr double, ptr %out, i64 4
+  %gep = getelementptr inbounds double, ptr %out, i64 4
  store atomic double %in, ptr %gep seq_cst, align 8
  ret void
}
@@ -13883,8 +13883,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr double, ptr %out, i64 %index
-  %gep = getelementptr double, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds double, ptr %out, i64 %index
+  %gep = getelementptr inbounds double, ptr %ptr, i64 4
  store atomic double %in, ptr %gep seq_cst, align 8
  ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
index 8991a062f37a4..107ee163a1e15 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -104,7 +104,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -165,8 +165,8 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -230,8 +230,8 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -385,7 +385,7 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -445,7 +445,7 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -493,7 +493,7 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -552,7 +552,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -613,8 +613,8 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -678,8 +678,8 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -833,7 +833,7 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -893,7 +893,7 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -941,7 +941,7 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1000,7 +1000,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1061,8 +1061,8 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1126,8 +1126,8 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1281,7 +1281,7 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1341,7 +1341,7 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1387,7 +1387,7 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1446,7 +1446,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1505,8 +1505,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1570,8 +1570,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1721,7 +1721,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1781,7 +1781,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -1886,7 +1886,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -1945,8 +1945,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2010,8 +2010,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -2161,7 +2161,7 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2221,7 +2221,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -2267,7 +2267,7 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2326,7 +2326,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -2385,8 +2385,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2450,8 +2450,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -2601,7 +2601,7 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2661,7 +2661,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -2707,7 +2707,7 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2766,7 +2766,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -2825,8 +2825,8 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -2890,8 +2890,8 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -3041,7 +3041,7 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3101,7 +3101,7 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -3149,7 +3149,7 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3208,7 +3208,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -3269,8 +3269,8 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3334,8 +3334,8 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
-  %gep = getelementptr i64, ptr %ptr, i64 4
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
+  %gep = getelementptr inbounds i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -3489,7 +3489,7 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3549,7 +3549,7 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
-  %ptr = getelementptr i64, ptr %out, i64 %index
+  %ptr = getelementptr inbounds i64, ptr %out, i64 %index
  %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  store i64 %tmp0, ptr %out2
  ret void
@@ -3597,7 +3597,7 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr i64, ptr %out, i64 4
+  %gep = getelementptr inbounds i64, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3644,7 +3644,7 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr double, ptr %out, i64 4
+  %gep = getelementptr inbounds double, ptr %out, i64 4
  %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3691,7 +3691,7 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
entry:
-  %gep = getelementptr ptr, ptr %out, i32 4
+  %gep = getelementptr inbounds ptr, ptr %out, i32 4
  %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret void
}
@@ -3750,7 +3750,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -3811,8 +3811,8 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -3876,8 +3876,8 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4031,7 +4031,7 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4091,7 +4091,7 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4139,7 +4139,7 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4198,7 +4198,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4259,8 +4259,8 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4324,8 +4324,8 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - 
%gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4479,7 +4479,7 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -4539,7 +4539,7 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -4590,7 +4590,7 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %in, i64 4 + %gep = getelementptr inbounds i64, ptr %in, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -4700,8 +4700,8 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %in, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %in, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = load atomic i64, ptr %gep seq_cst, align 8 store i64 %val, ptr %out ret void @@ -4761,7 +4761,7 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %in, i64 %index + %ptr = getelementptr inbounds i64, ptr %in, i64 %index %val = load atomic i64, ptr %ptr seq_cst, align 8 store i64 %val, ptr %out ret void @@ -4804,7 +4804,7 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -4896,8 +4896,8 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 store atomic i64 %in, ptr %gep seq_cst, align 8 ret void } @@ -4948,7 +4948,7 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index store atomic i64 %in, ptr %ptr seq_cst, align 8 ret void } @@ -5004,7 +5004,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old ; 
GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5060,7 +5060,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 9000 + %gep = getelementptr inbounds i64, ptr %out, i64 9000 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5120,7 +5120,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -5182,8 +5182,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5255,8 +5255,8 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -5422,7 +5422,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 ret void } @@ -5490,7 +5490,7 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst, !noalias.addrspace !0 %extract0 = extractvalue { i64, i1 } %val, 0 store i64 %extract0, ptr %out2 @@ -5542,7 +5542,7 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %in, i64 4 + %gep = getelementptr inbounds double, ptr %in, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -5652,8 +5652,8 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, 
ptr %out, i64 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %in, i64 %index - %gep = getelementptr double, ptr %ptr, i64 4 + %ptr = getelementptr inbounds double, ptr %in, i64 %index + %gep = getelementptr inbounds double, ptr %ptr, i64 4 %val = load atomic double, ptr %gep seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -5713,7 +5713,7 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %in, i64 %index + %ptr = getelementptr inbounds double, ptr %in, i64 %index %val = load atomic double, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 store double %val, ptr %out ret void @@ -5756,7 +5756,7 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -5848,8 +5848,8 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %out, i64 %index - %gep = getelementptr double, ptr %ptr, i64 4 + %ptr = getelementptr inbounds double, ptr %out, i64 %index + %gep = getelementptr inbounds double, ptr %ptr, i64 4 store atomic double %in, ptr %gep seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -5900,7 +5900,7 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr double, ptr %out, i64 %index + %ptr = getelementptr inbounds double, ptr %out, i64 %index store atomic double %in, ptr %ptr seq_cst, align 8, !noalias.addrspace !0 ret void } @@ -5947,7 +5947,7 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6006,7 +6006,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6067,8 +6067,8 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6132,8 +6132,8 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm 
entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6287,7 +6287,7 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6347,7 +6347,7 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6395,7 +6395,7 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6454,7 +6454,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6515,8 +6515,8 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6580,8 +6580,8 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void @@ -6735,7 +6735,7 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: global_inv scope:SCOPE_DEV ; GFX12-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 ret void } @@ -6795,7 +6795,7 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm entry: - 
%ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 store i64 %tmp0, ptr %out2 ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index fe47461ebf956..36fa4d4fe9018 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -63,7 +63,7 @@ define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -124,7 +124,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -209,7 +209,7 @@ define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -294,7 +294,7 @@ define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -327,7 +327,7 @@ define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -360,7 +360,7 @@ define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -425,7 +425,7 @@ define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -486,7 +486,7 @@ define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = 
getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -571,7 +571,7 @@ define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret void } @@ -656,7 +656,7 @@ define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i32 4 + %gep = getelementptr inbounds double, ptr %out, i32 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 ret double %result } @@ -689,7 +689,7 @@ define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -722,7 +722,7 @@ define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr double, ptr %out, i64 4 + %gep = getelementptr inbounds double, ptr %out, i64 4 %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret double %result } @@ -787,7 +787,7 @@ define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -848,7 +848,7 @@ define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -933,7 +933,7 @@ define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1018,7 +1018,7 @@ define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1051,7 +1051,7 @@ define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1084,7 +1084,7 @@ define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -1149,7 +1149,7 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1210,7 +1210,7 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1295,7 +1295,7 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1380,7 +1380,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1413,7 +1413,7 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1446,7 +1446,7 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -1511,7 +1511,7 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1572,7 +1572,7 @@ define i64 
@flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1657,7 +1657,7 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -1742,7 +1742,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -1775,7 +1775,7 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -1808,7 +1808,7 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -1987,7 +1987,7 @@ define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2170,7 +2170,7 @@ define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2373,7 +2373,7 @@ define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2576,7 +2576,7 @@ define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 
%result } @@ -2666,7 +2666,7 @@ define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -2758,7 +2758,7 @@ define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -2823,7 +2823,7 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -2884,7 +2884,7 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -2969,7 +2969,7 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3054,7 +3054,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3087,7 +3087,7 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -3120,7 +3120,7 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -3185,7 +3185,7 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = 
atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3246,7 +3246,7 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3331,7 +3331,7 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3416,7 +3416,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -3449,7 +3449,7 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -3482,7 +3482,7 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -3655,7 +3655,7 @@ define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -3832,7 +3832,7 @@ define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4041,7 +4041,7 @@ define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4250,7 +4250,7 @@ define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr 
%out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -4356,8 +4356,8 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4472,8 +4472,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -4576,7 +4576,7 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -4687,7 +4687,7 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -4775,7 +4775,7 @@ define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -4864,7 +4864,7 @@ define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -5037,7 +5037,7 @@ define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5214,7 +5214,7 @@ define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5423,7 +5423,7 @@ define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5632,7 +5632,7 @@ define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -5738,8 +5738,8 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -5854,8 +5854,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2 ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -5967,7 +5967,7 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 % ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -6055,7 +6055,7 @@ define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -6144,7 +6144,7 @@ define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -6317,7 +6317,7 @@ define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6494,7 +6494,7 @@ define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umin ptr 
%gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6703,7 +6703,7 @@ define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -6912,7 +6912,7 @@ define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i6 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -6999,7 +6999,7 @@ define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -7088,7 +7088,7 @@ define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -7261,7 +7261,7 @@ define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7438,7 +7438,7 @@ define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7647,7 +7647,7 @@ define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -7856,7 +7856,7 @@ define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -7962,8 +7962,8 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 % ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = 
getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8078,8 +8078,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index - %gep = getelementptr i64, ptr %ptr, i64 4 + %ptr = getelementptr inbounds i64, ptr %out, i64 %index + %gep = getelementptr inbounds i64, ptr %ptr, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -8286,7 +8286,7 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i ; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm entry: - %ptr = getelementptr i64, ptr %out, i64 %index + %ptr = getelementptr inbounds i64, ptr %out, i64 %index %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 store i64 %tmp0, ptr %out2 ret void @@ -8374,7 +8374,7 @@ define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -8463,7 +8463,7 @@ define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX9-NEXT: v_mov_b32_e32 v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -8528,7 +8528,7 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8589,7 +8589,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8674,7 +8674,7 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8759,7 +8759,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -8792,7 +8792,7 @@ define
void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -8825,7 +8825,7 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } @@ -8890,7 +8890,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -8951,7 +8951,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9036,7 +9036,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret void } @@ -9121,7 +9121,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result } @@ -9154,7 +9154,7 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret void } @@ -9187,7 +9187,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i64, ptr %out, i64 4 + %gep = getelementptr inbounds i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 ret i64 %result } diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 88cc4b1c96b4a..512dc21ade325 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -16,8 +16,8 @@ ; 
gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) -; FIXME the offset here should not be folded: if %p points to the beginning of -; scratch or LDS and %i is -1, a folded offset crashes the program. +; The offset here cannot be folded: if %p points to the beginning of scratch or +; LDS and %i is -1, a folded offset crashes the program. define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX90A-LABEL: flat_offset_maybe_oob: ; GFX90A: ; %bb.0: @@ -26,7 +26,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 12, v0 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -37,7 +39,9 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -46,7 +50,8 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 12 +; GFX942-NEXT: flat_load_dword v0, v[0:1] ; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -57,9 +62,12 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -76,7 +84,10 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 12 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -157,3 +168,353 @@ define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { %l = load i32, ptr addrspace(5) %arrayidx ret i32 %l } + +; If the GEP that adds the offset is inbounds, folding the offset is legal. 
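+; Since both geps are inbounds, every address they produce must stay within +; the allocation %p points into (an inbounds result outside the object is +; poison per the LangRef), and no single allocation straddles the scratch or +; LDS aperture boundary, so the base address and base+12 land in the same +; memory region and the hardware immediate offset can be applied safely.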
+define i32 @flat_offset_inbounds(ptr %p, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load i32, ptr %arrayidx + ret i32 %l +} + +define void @flat_offset_inbounds_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds_wide: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX90A-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds_wide: +; GFX10: ; %bb.0: 
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX10-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX10-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds_wide: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v8, v[0:1] offset:28 +; GFX942-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_store_dword v[2:3], v8 offset:16 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds_wide: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: flat_load_b32 v8, v[0:1] offset:28 +; GFX11-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX11-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; GFX11-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds_wide: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b32 v8, v[0:1] offset:28 +; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: flat_store_b32 v[2:3], v8 offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x1 +; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7] +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <5 x i32>, ptr %arrayidx + store <5 x i32> %l, ptr %pout + ret void +} + +define void @flat_offset_inbounds_very_wide(ptr %p, ptr %pout, i32 %i) { +; GFX90A-LABEL: flat_offset_inbounds_very_wide: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX90A-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX90A-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX90A-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108 +; GFX90A-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124 +; GFX90A-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76 +; GFX90A-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92 +; GFX90A-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44 +; GFX90A-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60 +; GFX90A-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX90A-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28 +; GFX90A-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48 +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX90A-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX90A-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_inbounds_very_wide: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX10-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108 +; GFX10-NEXT: flat_load_dwordx4 v[8:11], v[0:1] offset:124 +; GFX10-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76 +; GFX10-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92 +; GFX10-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44 +; GFX10-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60 +; GFX10-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX10-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28 +; GFX10-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140 +; GFX10-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96 +; GFX10-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112 +; GFX10-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX10-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80 +; GFX10-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX10-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX10-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_inbounds_very_wide: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 2, v[0:1] +; GFX942-NEXT: flat_load_dwordx4 v[4:7], v[0:1] offset:108 +; GFX942-NEXT: flat_load_dwordx4 v[8:11], 
v[0:1] offset:124 +; GFX942-NEXT: flat_load_dwordx4 v[12:15], v[0:1] offset:76 +; GFX942-NEXT: flat_load_dwordx4 v[16:19], v[0:1] offset:92 +; GFX942-NEXT: flat_load_dwordx4 v[20:23], v[0:1] offset:44 +; GFX942-NEXT: flat_load_dwordx4 v[24:27], v[0:1] offset:60 +; GFX942-NEXT: flat_load_dwordx4 v[28:31], v[0:1] offset:12 +; GFX942-NEXT: flat_load_dwordx4 v[32:35], v[0:1] offset:28 +; GFX942-NEXT: flat_load_dwordx4 v[36:39], v[0:1] offset:140 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[4:7] offset:96 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[8:11] offset:112 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[12:15] offset:64 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[16:19] offset:80 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[20:23] offset:32 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[24:27] offset:48 +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[28:31] +; GFX942-NEXT: flat_store_dwordx4 v[2:3], v[32:35] offset:16 +; GFX942-NEXT: flat_store_dwordx3 v[2:3], v[36:38] offset:128 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_inbounds_very_wide: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX11-NEXT: s_clause 0x8 +; GFX11-NEXT: flat_load_b128 v[4:7], v[0:1] offset:108 +; GFX11-NEXT: flat_load_b128 v[8:11], v[0:1] offset:124 +; GFX11-NEXT: flat_load_b128 v[12:15], v[0:1] offset:76 +; GFX11-NEXT: flat_load_b128 v[16:19], v[0:1] offset:92 +; GFX11-NEXT: flat_load_b128 v[20:23], v[0:1] offset:44 +; GFX11-NEXT: flat_load_b128 v[24:27], v[0:1] offset:60 +; GFX11-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX11-NEXT: flat_load_b128 v[32:35], v[0:1] offset:28 +; GFX11-NEXT: flat_load_b128 v[36:39], v[0:1] offset:140 +; GFX11-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[4:7] offset:96 +; GFX11-NEXT: s_waitcnt vmcnt(7) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[8:11] offset:112 +; GFX11-NEXT: s_waitcnt vmcnt(6) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[12:15] offset:64 +; GFX11-NEXT: s_waitcnt vmcnt(5) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[16:19] offset:80 +; GFX11-NEXT: s_waitcnt vmcnt(4) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[24:27] offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(8) +; GFX11-NEXT: flat_store_b128 v[2:3], v[32:35] offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(8) +; GFX11-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_inbounds_very_wide: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[4:5] +; GFX12-NEXT: 
v_add_co_u32 v0, vcc_lo, v0, v4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo +; GFX12-NEXT: s_clause 0x8 +; GFX12-NEXT: flat_load_b128 v[4:7], v[0:1] offset:108 +; GFX12-NEXT: flat_load_b128 v[8:11], v[0:1] offset:124 +; GFX12-NEXT: flat_load_b128 v[12:15], v[0:1] offset:76 +; GFX12-NEXT: flat_load_b128 v[16:19], v[0:1] offset:92 +; GFX12-NEXT: flat_load_b128 v[20:23], v[0:1] offset:44 +; GFX12-NEXT: flat_load_b128 v[24:27], v[0:1] offset:60 +; GFX12-NEXT: flat_load_b128 v[28:31], v[0:1] offset:12 +; GFX12-NEXT: flat_load_b128 v[32:35], v[0:1] offset:28 +; GFX12-NEXT: flat_load_b128 v[36:39], v[0:1] offset:140 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x808 +; GFX12-NEXT: flat_store_b128 v[2:3], v[4:7] offset:96 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x708 +; GFX12-NEXT: flat_store_b128 v[2:3], v[8:11] offset:112 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x608 +; GFX12-NEXT: flat_store_b128 v[2:3], v[12:15] offset:64 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x508 +; GFX12-NEXT: flat_store_b128 v[2:3], v[16:19] offset:80 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x408 +; GFX12-NEXT: flat_store_b128 v[2:3], v[20:23] offset:32 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x308 +; GFX12-NEXT: flat_store_b128 v[2:3], v[24:27] offset:48 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x208 +; GFX12-NEXT: flat_store_b128 v[2:3], v[28:31] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x108 +; GFX12-NEXT: flat_store_b128 v[2:3], v[32:35] offset:16 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x8 +; GFX12-NEXT: flat_store_b96 v[2:3], v[36:38] offset:128 +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_setpc_b64 s[30:31] + %p.1 = getelementptr inbounds i32, ptr %p, i32 %i + %arrayidx = getelementptr inbounds i32, ptr %p.1, i32 3 + %l = load <35 x i32>, ptr %arrayidx + store <35 x i32> %l, ptr %pout + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 75d2f156bdd2c..985917988e919 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -31,7 +31,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { - %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void @@ -57,7 +57,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) nounw ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) nounwind { - %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -80,7 +80,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr ; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GFX9: global_atomic_dec v{{[0-9]+}}, [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr 
addrspace(1) %out, ptr addrspace(1) %ptr) #0 { - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void @@ -104,7 +104,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) no ; GFX9-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GFX9: global_atomic_dec [[ZERO]], [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) nounwind { - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -115,9 +115,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) % ; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id - %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 %id + %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %id + %gep = getelementptr inbounds i32, ptr addrspace(1) %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out.gep ret void @@ -129,8 +129,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace ; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 %id + %gep = getelementptr inbounds i32, ptr addrspace(1) %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -149,7 +149,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #0 { ; CIVI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} ; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #0 { - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void @@ -168,7 +168,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) nounwind { ; CIVI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} ; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind { - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -179,9 
+179,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind { ; GFX9: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %out.gep = getelementptr i32, ptr %out, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds i32, ptr %out, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out.gep ret void @@ -193,8 +193,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -215,7 +215,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #0 { ; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 { - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void @@ -236,7 +236,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind { ; CIVI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -248,9 +248,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind { ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}} define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %out.gep = getelementptr i64, ptr %out, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds i64, ptr %out, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out.gep ret void @@ -263,8 +263,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}} define amdgpu_kernel void 
@flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -280,7 +280,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 + %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false) store i32 %idx.0, ptr addrspace(1) %add_use store i32 %val0, ptr addrspace(1) %out @@ -308,7 +308,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32 define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { - %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void @@ -334,7 +334,7 @@ define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) nounw ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) nounwind { - %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -359,7 +359,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr ; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} ; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { - %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void @@ -383,7 +383,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) no ; CIVI: buffer_atomic_dec_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} ; GFX9: global_atomic_dec_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) nounwind { - %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr 
addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -396,9 +396,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) % ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id - %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id - %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 %id + %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i32 %id + %gep = getelementptr inbounds i64, ptr addrspace(1) %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out.gep ret void @@ -412,8 +412,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id - %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 %id + %gep = getelementptr inbounds i64, ptr addrspace(1) %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -429,7 +429,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 + %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false) store i32 %idx.0, ptr addrspace(1) %add_use store i64 %val0, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index b28405f4ff113..fe6d5832e6f1e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -31,7 +31,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { - %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void @@ -57,7 +57,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounw ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16 define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) nounwind { - %gep = getelementptr i32, ptr 
addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(3) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -77,7 +77,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} ; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out ret void @@ -97,7 +97,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) no ; CIVI: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} ; GFX9: global_atomic_inc v{{[0-9]+}}, [[K]], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) nounwind { - %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -108,9 +108,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) % ; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id - %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 %id + %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %id + %gep = getelementptr inbounds i32, ptr addrspace(1) %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr addrspace(1) %out.gep ret void @@ -122,8 +122,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace ; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id - %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr addrspace(1) %ptr, i32 %id + %gep = getelementptr inbounds i32, ptr addrspace(1) %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -136,7 +136,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 + %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0 %val0 = 
call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false) store i32 %idx.0, ptr addrspace(1) %add_use store i32 %val0, ptr addrspace(1) %out @@ -158,7 +158,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32 define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 { - %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void @@ -178,7 +178,7 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounw ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) nounwind { - %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(3) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -202,7 +202,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr ; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} ; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { - %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr addrspace(1) %out ret void @@ -227,7 +227,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no ; CIVI: buffer_atomic_inc_x2 v[[[KLO]]:[[KHI]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} ; GFX9: global_atomic_inc_x2 v[[ZERO]], v[[[KLO]]:[[KHI]]], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) nounwind { - %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -240,9 +240,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) % ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id - %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id - %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 %id + %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i32 %id + %gep = getelementptr inbounds i64, ptr addrspace(1) %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 
0, i1 false) store i64 %result, ptr addrspace(1) %out.gep ret void @@ -256,8 +256,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]]{{$}} define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id - %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr addrspace(1) %ptr, i32 %id + %gep = getelementptr inbounds i64, ptr addrspace(1) %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -276,7 +276,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 { ; CIVI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} ; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 { - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out ret void @@ -295,7 +295,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind { ; CIVI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} ; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:16{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind { - %gep = getelementptr i32, ptr %ptr, i32 4 + %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -306,9 +306,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind { ; GFX9: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %out.gep = getelementptr i32, ptr %out, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds i32, ptr %out, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) store i32 %result, ptr %out.gep ret void @@ -320,8 +320,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX9: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]] offset:20{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i32, ptr %ptr, i32 %id - %gep = getelementptr i32, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i32, ptr %ptr, i32 %id + %gep = getelementptr inbounds i32, ptr %gep.tid, i32 5 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false) ret void } @@ -334,7 +334,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 - %arrayidx0 = getelementptr 
inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 + %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0 %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false) store i32 %idx.0, ptr addrspace(1) %add_use store i64 %val0, ptr addrspace(1) %out @@ -357,7 +357,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 { ; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] glc{{$}} ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 { - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out ret void @@ -378,7 +378,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind { ; CIVI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]{{\]$}} ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:32{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { - %gep = getelementptr i64, ptr %ptr, i32 4 + %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } @@ -390,9 +390,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind { ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40 glc{{$}} define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %out.gep = getelementptr i64, ptr %out, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %out.gep = getelementptr inbounds i64, ptr %out, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) store i64 %result, ptr %out.gep ret void @@ -405,8 +405,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX9: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[[[KLO]]:[[KHI]]] offset:40{{$}} define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() - %gep.tid = getelementptr i64, ptr %ptr, i32 %id - %gep = getelementptr i64, ptr %gep.tid, i32 5 + %gep.tid = getelementptr inbounds i64, ptr %ptr, i32 %id + %gep = getelementptr inbounds i64, ptr %gep.tid, i32 5 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 874dece6b728d..0c55c91ba8dbd 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -11,18 +11,22 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-NEXT: s_mov_b32 s5, -1 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: 
s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-NEXT: .LBB0_2: ; %for.body ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-NEXT: s_add_co_i32 s6, s6, -1 ; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 -; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GFX12-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b128 v[4:5], v[0:3] ; GFX12-NEXT: s_cbranch_scc1 .LBB0_2 @@ -37,17 +41,20 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s ; GFX12-SPREFETCH-NEXT: s_cbranch_scc1 .LBB0_3 ; GFX12-SPREFETCH-NEXT: ; %bb.1: ; %for.body.preheader ; GFX12-SPREFETCH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-SPREFETCH-NEXT: s_movk_i32 s4, 0xff50 +; GFX12-SPREFETCH-NEXT: s_mov_b32 s5, -1 ; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 ; GFX12-SPREFETCH-NEXT: .LBB0_2: ; %for.body ; GFX12-SPREFETCH-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-SPREFETCH-NEXT: s_wait_alu 0xfffe -; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[8:9], s[2:3], s[4:5] ; GFX12-SPREFETCH-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX12-SPREFETCH-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 ; GFX12-SPREFETCH-NEXT: s_add_co_i32 s6, s6, -1 -; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GFX12-SPREFETCH-NEXT: flat_load_b128 v[0:3], v[0:1] ; GFX12-SPREFETCH-NEXT: s_cmp_lg_u32 s6, 0 ; GFX12-SPREFETCH-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SPREFETCH-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 565fce0e7abde..9ddf87aa39140 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -16031,6 +16031,241 @@ entry: ret void } +define void @memset_p0_sz19(ptr addrspace(0) %dst) { +; CHECK-LABEL: memset_p0_sz19: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0x41414141 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x41 +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_mov_b32_e32 v7, 0x4141 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v5, s5 +; CHECK-NEXT: v_mov_b32_e32 v3, s5 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 +; CHECK-NEXT: flat_store_short v[0:1], v7 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; +; ALIGNED-LABEL: memset_p0_sz19: +; ALIGNED: ; %bb.0: ; %entry +; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ALIGNED-NEXT: v_mov_b32_e32 v2, 0x41 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:18 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:17 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:16 +; ALIGNED-NEXT: flat_store_byte v[0:1], v2 
offset:15
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:14
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:13
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:12
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:11
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:10
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:9
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:8
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:7
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:6
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:5
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:4
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:3
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:2
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2 offset:1
+; ALIGNED-NEXT: flat_store_byte v[0:1], v2
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p0_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v6, 0x41
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v7, 0x4141
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v5, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s5
+; UNROLL3-NEXT: flat_store_byte v[0:1], v6 offset:18
+; UNROLL3-NEXT: flat_store_short v[0:1], v7 offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull %dst, i8 65, i64 19, i1 false)
+  ret void
+}
+
+define void @memset_p1_sz19(ptr addrspace(1) %dst) {
+; CHECK-LABEL: memset_p1_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v2
+; CHECK-NEXT: v_mov_b32_e32 v5, v2
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dword v[0:1], v2, off offset:15
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p1_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v2, 0x41
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:18
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:17
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:16
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:15
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:14
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:13
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:12
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:11
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:10
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:9
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:8
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:7
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:6
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:5
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:4
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:3
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:2
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off offset:1
+; ALIGNED-NEXT: global_store_byte v[0:1], v2, off
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p1_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: v_mov_b32_e32 v2, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v3, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v4, v2
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v2
+; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; UNROLL3-NEXT: global_store_dword v[0:1], v2, off offset:15
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull %dst, i8 65, i64 19, i1 false)
+  ret void
+}
+
+define void @memset_p3_sz19(ptr addrspace(3) %dst) {
+; CHECK-LABEL: memset_p3_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x41
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: v_mov_b32_e32 v4, 0x4141
+; CHECK-NEXT: v_mov_b32_e32 v1, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-NEXT: ds_write_b8 v0, v3 offset:18
+; CHECK-NEXT: ds_write_b16 v0, v4 offset:16
+; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:1
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p3_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:18
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:17
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:16
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:15
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:14
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:13
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:12
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:11
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:10
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:9
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:8
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:7
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:6
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:5
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:4
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:3
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:2
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:1
+; ALIGNED-NEXT: ds_write_b8 v0, v1
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p3_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v3, 0x41
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v4, 0x4141
+; UNROLL3-NEXT: v_mov_b32_e32 v1, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s5
+; UNROLL3-NEXT: ds_write_b8 v0, v3 offset:18
+; UNROLL3-NEXT: ds_write_b16 v0, v4 offset:16
+; UNROLL3-NEXT: ds_write2_b64 v0, v[1:2], v[1:2] offset1:1
+; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull %dst, i8 65, i64 19, i1 false)
+  ret void
+}
+
+define void @memset_p5_sz19(ptr addrspace(5) %dst) {
+; CHECK-LABEL: memset_p5_sz19:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41414141
+; CHECK-NEXT: v_mov_b32_e32 v2, 0x41
+; CHECK-NEXT: v_mov_b32_e32 v3, 0x4141
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p5_sz19:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p5_sz19:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v2, 0x41
+; UNROLL3-NEXT: v_mov_b32_e32 v3, 0x4141
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; UNROLL3-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:18
+; UNROLL3-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull %dst, i8 65, i64 19, i1 false)
+  ret void
+}
 declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
 declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
@@ -16046,4 +16281,10 @@ declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr a
 declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr addrspace(0) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) #3
+
 attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 5af37809443e0..961803218969c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -3011,7 +3011,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic ret void } @@ -3265,7 +3265,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic ret void } @@ -3512,7 +3512,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic ret void } @@ -3790,7 +3790,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic ret void } @@ -4068,7 +4068,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic ret void } @@ -4322,7 +4322,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire ret void } @@ -4576,7 +4576,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire ret void } @@ -4854,7 +4854,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire ret void } @@ -5132,7 +5132,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire ret void } @@ -5410,7 +5410,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = 
getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire ret void } @@ -5688,7 +5688,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst ret void } @@ -5966,7 +5966,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst ret void } @@ -6244,7 +6244,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst ret void } @@ -6522,7 +6522,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst ret void } @@ -6800,7 +6800,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst ret void } @@ -7067,7 +7067,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7351,7 +7351,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7644,7 +7644,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7956,7 +7956,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8268,7 
+8268,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8556,7 +8556,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8840,7 +8840,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9152,7 +9152,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9464,7 +9464,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9776,7 +9776,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10088,7 +10088,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10396,7 +10396,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10708,7 +10708,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, 
i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11020,7 +11020,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11332,7 +11332,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -14376,7 +14376,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic ret void } @@ -14626,7 +14626,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic ret void } @@ -14873,7 +14873,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic ret void } @@ -15147,7 +15147,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic ret void } @@ -15421,7 +15421,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic ret void } @@ -15671,7 +15671,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire ret void } @@ -15921,7 +15921,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire ret void } @@ 
-16195,7 +16195,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire ret void } @@ -16469,7 +16469,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire ret void } @@ -16743,7 +16743,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire ret void } @@ -17017,7 +17017,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst ret void } @@ -17291,7 +17291,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst ret void } @@ -17565,7 +17565,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst ret void } @@ -17839,7 +17839,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst ret void } @@ -18113,7 +18113,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst ret void } @@ -18380,7 +18380,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18674,7 +18674,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr 
inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18967,7 +18967,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19289,7 +19289,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19611,7 +19611,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19909,7 +19909,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20203,7 +20203,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20525,7 +20525,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20847,7 +20847,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21169,7 +21169,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("agent-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21491,7 +21491,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21809,7 +21809,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22131,7 +22131,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22453,7 +22453,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22775,7 +22775,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b80dfaea01653..1035684a39bce 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -2641,7 +2641,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic ret void } @@ -2864,7 +2864,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic ret void } @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, 
ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic ret void } @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic ret void } @@ -3533,7 +3533,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic ret void } @@ -3756,7 +3756,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire ret void } @@ -3979,7 +3979,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire ret void } @@ -4202,7 +4202,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire ret void } @@ -4425,7 +4425,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire ret void } @@ -4648,7 +4648,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire ret void } @@ -4871,7 +4871,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst ret void } @@ -5094,7 +5094,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst ret void } @@ -5317,7 +5317,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, 
i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst ret void } @@ -5540,7 +5540,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst ret void } @@ -5763,7 +5763,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst ret void } @@ -6030,7 +6030,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6299,7 +6299,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6568,7 +6568,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6837,7 +6837,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7106,7 +7106,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7375,7 +7375,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7644,7 +7644,7 @@ define amdgpu_kernel void 
@flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7913,7 +7913,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8182,7 +8182,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8451,7 +8451,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8720,7 +8720,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8989,7 +8989,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9258,7 +9258,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9527,7 +9527,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9796,7 +9796,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr 
i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12432,7 +12432,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic ret void } @@ -12655,7 +12655,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic ret void } @@ -12878,7 +12878,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic ret void } @@ -13101,7 +13101,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic ret void } @@ -13324,7 +13324,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic ret void } @@ -13547,7 +13547,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire ret void } @@ -13770,7 +13770,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire ret void } @@ -13993,7 +13993,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire ret void } @@ -14216,7 +14216,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr 
%gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire ret void } @@ -14439,7 +14439,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire ret void } @@ -14662,7 +14662,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic seq_cst ret void } @@ -14885,7 +14885,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst ret void } @@ -15108,7 +15108,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst ret void } @@ -15331,7 +15331,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst ret void } @@ -15554,7 +15554,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst ret void } @@ -15821,7 +15821,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16090,7 +16090,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16359,7 +16359,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("singlethread-one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16628,7 +16628,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16897,7 +16897,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17166,7 +17166,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17435,7 +17435,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17704,7 +17704,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17973,7 +17973,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18242,7 +18242,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18511,7 +18511,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in 
syncscope("singlethread-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18780,7 +18780,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19049,7 +19049,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19318,7 +19318,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19587,7 +19587,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 1ec942ea5f47b..6355c4d023e8c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -3055,7 +3055,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic ret void } @@ -3311,7 +3311,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic ret void } @@ -3562,7 +3562,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic ret void } @@ -3846,7 +3846,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic ret void } @@ -4130,7 +4130,7 @@ define 
amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic ret void } @@ -4386,7 +4386,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire ret void } @@ -4642,7 +4642,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire ret void } @@ -4926,7 +4926,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire ret void } @@ -5210,7 +5210,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire ret void } @@ -5494,7 +5494,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire ret void } @@ -5778,7 +5778,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst ret void } @@ -6062,7 +6062,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst ret void } @@ -6346,7 +6346,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst ret void } @@ -6630,7 +6630,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst ret void } @@ -6914,7 +6914,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst 
seq_cst ret void } @@ -7181,7 +7181,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7467,7 +7467,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7764,7 +7764,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8082,7 +8082,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8400,7 +8400,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8690,7 +8690,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8976,7 +8976,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9294,7 +9294,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9612,7 +9612,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel acquire %val0 = extractvalue { i32, 
i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9930,7 +9930,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10248,7 +10248,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10562,7 +10562,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -10880,7 +10880,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11198,7 +11198,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -11516,7 +11516,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -14604,7 +14604,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic ret void } @@ -14856,7 +14856,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic ret void } @@ -15107,7 +15107,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic ret void } @@ -15387,7 +15387,7 @@ define 
amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic ret void } @@ -15667,7 +15667,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic ret void } @@ -15919,7 +15919,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire ret void } @@ -16171,7 +16171,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire ret void } @@ -16451,7 +16451,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire ret void } @@ -16731,7 +16731,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire ret void } @@ -17011,7 +17011,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire ret void } @@ -17291,7 +17291,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst ret void } @@ -17571,7 +17571,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst ret void } @@ -17851,7 +17851,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst ret void } @@ -18131,7 
+18131,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst ret void } @@ -18411,7 +18411,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst ret void } @@ -18678,7 +18678,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18974,7 +18974,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19271,7 +19271,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19599,7 +19599,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19927,7 +19927,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20227,7 +20227,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20523,7 +20523,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile 
ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20851,7 +20851,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21179,7 +21179,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21507,7 +21507,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -21835,7 +21835,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22159,7 +22159,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22487,7 +22487,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -22815,7 +22815,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -23143,7 +23143,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 588f06f1be054..84d68f213a570 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -2641,7 +2641,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic ret void } @@ -2864,7 +2864,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic ret void } @@ -3087,7 +3087,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic ret void } @@ -3310,7 +3310,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic ret void } @@ -3533,7 +3533,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic ret void } @@ -3756,7 +3756,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire ret void } @@ -3979,7 +3979,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire ret void } @@ -4202,7 +4202,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire ret void } @@ -4425,7 +4425,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire ret void } @@ -4648,7 +4648,7 @@ define amdgpu_kernel void 
@flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire ret void } @@ -4871,7 +4871,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst ret void } @@ -5094,7 +5094,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst ret void } @@ -5317,7 +5317,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst ret void } @@ -5540,7 +5540,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst ret void } @@ -5763,7 +5763,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst ret void } @@ -6030,7 +6030,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6299,7 +6299,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6568,7 +6568,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6837,7 +6837,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + 
%gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7106,7 +7106,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7375,7 +7375,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7644,7 +7644,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7913,7 +7913,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8182,7 +8182,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8451,7 +8451,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8720,7 +8720,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8989,7 +8989,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acquire seq_cst %val0 = extractvalue { i32, i1 
} %val, 0 store i32 %val0, ptr %out, align 4 @@ -9258,7 +9258,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9527,7 +9527,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9796,7 +9796,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12432,7 +12432,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic ret void } @@ -12655,7 +12655,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic ret void } @@ -12878,7 +12878,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic ret void } @@ -13101,7 +13101,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic ret void } @@ -13324,7 +13324,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic ret void } @@ -13547,7 +13547,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire ret void } @@ 
-13770,7 +13770,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire ret void } @@ -13993,7 +13993,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire ret void } @@ -14216,7 +14216,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire ret void } @@ -14439,7 +14439,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire ret void } @@ -14662,7 +14662,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst ret void } @@ -14885,7 +14885,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst ret void } @@ -15108,7 +15108,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst ret void } @@ -15331,7 +15331,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst ret void } @@ -15554,7 +15554,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst ret void } @@ -15821,7 +15821,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr 
inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16090,7 +16090,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16359,7 +16359,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16628,7 +16628,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16897,7 +16897,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17166,7 +17166,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17435,7 +17435,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17704,7 +17704,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17973,7 +17973,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = 
cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18242,7 +18242,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18511,7 +18511,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18780,7 +18780,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19049,7 +19049,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19318,7 +19318,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index ee7d79a8a8cbb..b187fcfc2f3d2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -2900,7 +2900,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic ret void } @@ -3142,7 +3142,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic ret void } @@ -3382,7 +3382,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep =
getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic ret void } @@ -3641,7 +3641,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic ret void } @@ -3900,7 +3900,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic ret void } @@ -4142,7 +4142,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire ret void } @@ -4384,7 +4384,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire ret void } @@ -4643,7 +4643,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire ret void } @@ -4902,7 +4902,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire ret void } @@ -5161,7 +5161,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire ret void } @@ -5420,7 +5420,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst ret void } @@ -5687,7 +5687,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -5968,7 +5968,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; 
GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6254,7 +6254,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6554,7 +6554,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -6854,7 +6854,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7137,7 +7137,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7418,7 +7418,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -7718,7 +7718,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8018,7 +8018,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8318,7 +8318,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = 
cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8618,7 +8618,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -8916,7 +8916,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9216,7 +9216,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9516,7 +9516,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -9816,7 +9816,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -12608,7 +12608,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic ret void } @@ -12841,7 +12841,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic ret void } @@ -13074,7 +13074,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic ret void } @@ -13317,7 +13317,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - 
%gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic ret void } @@ -13560,7 +13560,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic ret void } @@ -13793,7 +13793,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire ret void } @@ -14026,7 +14026,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire ret void } @@ -14269,7 +14269,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire ret void } @@ -14512,7 +14512,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire ret void } @@ -14755,7 +14755,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire ret void } @@ -14998,7 +14998,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst ret void } @@ -15241,7 +15241,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst ret void } @@ -15484,7 +15484,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst ret void } @@ -15727,7 +15727,7 @@ 
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst ret void } @@ -15970,7 +15970,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst ret void } @@ -16237,7 +16237,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16514,7 +16514,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -16793,7 +16793,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17082,7 +17082,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17371,7 +17371,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17650,7 +17650,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -17927,7 +17927,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32,
ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18216,7 +18216,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18505,7 +18505,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -18794,7 +18794,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19083,7 +19083,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19370,7 +19370,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19659,7 +19659,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -19948,7 +19948,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 @@ -20237,7 +20237,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: - %gep = getelementptr i32, ptr %out, i32 4 + %gep = getelementptr inbounds i32, 
ptr %out, i32 4 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, ptr %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll index 1ecf8f228c625..81687a52fa14b 100644 --- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll @@ -45,7 +45,7 @@ define i8 @flat_inst_valu_offset_1(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 1 + %gep = getelementptr inbounds i8, ptr %p, i64 1 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -84,7 +84,7 @@ define i8 @flat_inst_valu_offset_11bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 2047 + %gep = getelementptr inbounds i8, ptr %p, i64 2047 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -123,7 +123,7 @@ define i8 @flat_inst_valu_offset_12bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 4095 + %gep = getelementptr inbounds i8, ptr %p, i64 4095 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -184,7 +184,7 @@ define i8 @flat_inst_valu_offset_13bit_max(ptr %p) { ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8191 + %gep = getelementptr inbounds i8, ptr %p, i64 8191 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -245,7 +245,7 @@ define i8 @flat_inst_valu_offset_24bit_max(ptr %p) { ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8388607 + %gep = getelementptr inbounds i8, ptr %p, i64 8388607 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -288,7 +288,7 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -2048 + %gep = getelementptr inbounds i8, ptr %p, i64 -2048 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -331,7 +331,7 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -4096 + %gep = getelementptr inbounds i8, ptr %p, i64 -4096 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -374,7 +374,7 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -8192 + %gep = getelementptr inbounds i8, ptr %p, i64 -8192 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -417,7 +417,7 @@ define i8 @flat_inst_valu_offset_neg_24bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8388608 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -8388608 + %gep = getelementptr inbounds i8, ptr %p, i64 -8388608 %load = load i8, ptr 
%gep, align 4 ret i8 %load } @@ -457,7 +457,7 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 4095 + %gep = getelementptr inbounds i8, ptr %p, i64 4095 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -518,7 +518,7 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(ptr %p) { ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8191 + %gep = getelementptr inbounds i8, ptr %p, i64 8191 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -579,7 +579,7 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(ptr %p) { ; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 16383 + %gep = getelementptr inbounds i8, ptr %p, i64 16383 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -659,7 +659,7 @@ define i8 @flat_inst_valu_offset_2x_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 16777214 + %gep = getelementptr inbounds i8, ptr %p, i64 16777214 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -702,7 +702,7 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -4096 + %gep = getelementptr inbounds i8, ptr %p, i64 -4096 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -745,7 +745,7 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -8192 + %gep = getelementptr inbounds i8, ptr %p, i64 -8192 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -788,7 +788,7 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(ptr %p) { ; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -16384 + %gep = getelementptr inbounds i8, ptr %p, i64 -16384 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -850,7 +850,7 @@ define i8 @flat_inst_valu_offset_2x_neg_24bit_max(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -16777215 + %gep = getelementptr inbounds i8, ptr %p, i64 -16777215 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -931,7 +931,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589936639 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936639 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1012,7 +1012,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589936640 + %gep = getelementptr inbounds i8, ptr %p, i64 
8589936640 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1093,7 +1093,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589938687 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938687 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1156,7 +1156,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589938688 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938688 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1237,7 +1237,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589942783 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942783 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1300,7 +1300,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 8589942784 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942784 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1374,7 +1374,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773761 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1448,7 +1448,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773760 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1522,7 +1522,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771713 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1596,7 +1596,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771712 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1670,7 +1670,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767617 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1744,7 +1744,7 @@ define i8 
@flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767616 %load = load i8, ptr %gep, align 4 ret i8 %load } @@ -1793,7 +1793,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 1 + %gep = getelementptr inbounds i8, ptr %p, i64 1 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -1843,7 +1843,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 2047 + %gep = getelementptr inbounds i8, ptr %p, i64 2047 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -1893,7 +1893,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 4095 + %gep = getelementptr inbounds i8, ptr %p, i64 4095 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -1973,7 +1973,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8191 + %gep = getelementptr inbounds i8, ptr %p, i64 8191 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2053,7 +2053,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -2048 + %gep = getelementptr inbounds i8, ptr %p, i64 -2048 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2133,7 +2133,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -4096 + %gep = getelementptr inbounds i8, ptr %p, i64 -4096 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2213,7 +2213,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -8192 + %gep = getelementptr inbounds i8, ptr %p, i64 -8192 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2263,7 +2263,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 4095 + %gep = getelementptr inbounds i8, ptr %p, i64 4095 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2343,7 +2343,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8191 + %gep = getelementptr inbounds i8, ptr %p, i64 8191 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2423,7 +2423,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 16383 + %gep = getelementptr inbounds i8, ptr %p, i64 16383 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2503,7 +2503,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -4096 + %gep = getelementptr inbounds i8, ptr %p, i64 -4096 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -8192 + %gep = getelementptr inbounds i8, ptr %p, i64 -8192 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2663,7 +2663,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) { ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX11-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -16384 + %gep = getelementptr inbounds i8, ptr %p, i64 -16384 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2758,7 +2758,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589936639 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936639 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2853,7 +2853,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589936640 + %gep = getelementptr inbounds i8, ptr %p, i64 8589936640 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -2948,7 +2948,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589938687 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938687 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3044,7 +3044,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589938688 + %gep = getelementptr inbounds i8, ptr %p, i64 8589938688 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3140,7 +3140,7 @@ define amdgpu_kernel void 
@flat_inst_salu_offset_64bit_13bit_split0(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589942783 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942783 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3236,7 +3236,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) { ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 8589942784 + %gep = getelementptr inbounds i8, ptr %p, i64 8589942784 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3335,7 +3335,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854773761 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773761 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3434,7 +3434,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854773760 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854773760 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3533,7 +3533,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854771713 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771713 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3632,7 +3632,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854771712 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854771712 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3731,7 +3731,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854767617 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767617 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void @@ -3830,7 +3830,7 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-GISEL-NEXT: flat_store_b8 v[0:1], v0 ; GFX12-GISEL-NEXT: s_endpgm - %gep = getelementptr i8, ptr %p, i64 -9223372036854767616 + %gep = getelementptr inbounds i8, ptr %p, i64 -9223372036854767616 %load = load volatile i8, ptr %gep, align 1 store i8 %load, ptr poison ret void diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index afb0ab958954c..7dcbf1f017c27 100644 --- 
a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -2635,7 +2635,7 @@ define amdgpu_kernel void @negativeoffsetnullptr(ptr %buffer) { ; GFX11-NEXT: s_endpgm entry: %null = select i1 false, ptr %buffer, ptr addrspacecast (ptr addrspace(5) null to ptr) - %gep = getelementptr i8, ptr %null, i64 -1 + %gep = getelementptr inbounds i8, ptr %null, i64 -1 %ld = load i8, ptr %gep %cmp = icmp eq i8 %ld, 0 br label %branch diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 39af91b81110d..60aac9ad56f06 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -13,9 +13,9 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s2, s0 ; CHECK-NEXT: s_addc_u32 s1, s3, s1 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 +; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol