Skip to content

Commit

Permalink
[AMDGPU] Add intrinsics for atomic struct buffer loads (#100140)
Browse files Browse the repository at this point in the history
Summary:
Mark these intrinsics as atomic loads within LLVM to prevent them from being
hoisted out of loops in cases where the load is considered invariant.

Similar to #97707, but for struct buffer loads.

Test Plan: 

Reviewers: 

Subscribers: 

Tasks: 

Tags: 


Differential Revision: https://phabricator.intern.facebook.com/D60250668
  • Loading branch information
OutOfCache authored and yuxuanchen1997 committed Jul 25, 2024
1 parent 94e36f6 commit 9c03782
Show file tree
Hide file tree
Showing 6 changed files with 772 additions and 3 deletions.
35 changes: 35 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,23 @@ class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntri
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

// Struct buffer load that is marked as an atomic load within LLVM, so that it
// is not hoisted out of loops in cases where the load would otherwise be
// considered invariant.
//
// Unlike AMDGPUStructBufferLoad, this class derives from Intrinsic rather than
// DefaultAttrsIntrinsic and spells out its attribute list explicitly.
// NOTE(review): presumably the default attribute set is avoided because it
// would permit the hoisting this intrinsic exists to prevent -- confirm.
//
// Operands: rsrc (as a v4i32 value), vindex, offset, soffset and an immediate
// auxiliary/cachepolicy word; the result type is overloaded via data_ty.
class AMDGPUStructAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
[data_ty],
[llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
// ImmArg<ArgIndex<4>>: the auxiliary/cachepolicy operand (index 4) must be an
// immediate.
[ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
// Concrete intrinsic; the return type stays overloaded (llvm_any_ty default).
def int_amdgcn_struct_atomic_buffer_load : AMDGPUStructAtomicBufferLoad;

class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[data_ty],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
Expand All @@ -1219,6 +1236,24 @@ class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIn
def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;

// Variant of the atomic struct buffer load whose resource operand is typed as
// AMDGPUBufferRsrcTy instead of a v4i32 value. Like the v4i32 form, it is
// marked as an atomic load within LLVM so that it is not hoisted out of loops
// in cases where the load would otherwise be considered invariant.
//
// Derives from Intrinsic (not DefaultAttrsIntrinsic, as
// AMDGPUStructPtrBufferLoad does) and lists its attributes explicitly.
// NOTE(review): AMDGPUBufferRsrcTy is presumably a buffer-resource pointer
// type, which is what makes IntrArgMemOnly / NoCapture meaningful -- confirm.
class AMDGPUStructPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic <
[data_ty],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
llvm_i32_ty], // auxiliary/cachepolicy(imm):
// bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11),
// bit 3 = swz, bit 4 = scc (gfx90a)
// gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
// gfx12+: bits [0-2] = th, bits [3-4] = scope,
// bit 6 = swz
// all: volatile op (bit 31, stripped at lowering)
// Memory effects are limited to the arguments, and the rsrc operand (index 0)
// is not captured.
[IntrArgMemOnly, NoCapture<ArgIndex<0>>,
// The auxiliary/cachepolicy operand (index 4) must be an immediate.
ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<0>;
// Concrete intrinsic; the return type stays overloaded (llvm_any_ty default).
def int_amdgcn_struct_ptr_atomic_buffer_load : AMDGPUStructPtrAtomicBufferLoad;

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
[],
[data_ty, // vdata(VGPR)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7375,6 +7375,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
return legalizeBufferLoad(MI, MRI, B, false, false);
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5020,7 +5020,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1278,7 +1278,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
case Intrinsic::amdgcn_raw_atomic_buffer_load:
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: {
case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
Info.memVT =
memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(),
std::numeric_limits<unsigned>::max());
Expand Down Expand Up @@ -8925,7 +8927,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_atomic_buffer_load:
case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
const bool IsFormat =
IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
Expand Down
Loading

0 comments on commit 9c03782

Please sign in to comment.