Skip to content

Commit

Permalink
AMDGPU: Remove ds_fmin/ds_fmax intrinsics
Browse files Browse the repository at this point in the history
These have been replaced with atomicrmw.
  • Loading branch information
arsenm committed Jun 27, 2024
1 parent 8f63d15 commit 906c8fe
Show file tree
Hide file tree
Showing 12 changed files with 65 additions and 2,154 deletions.
5 changes: 5 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ Changes to the AMDGPU Backend

* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`

* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
`fmax` with addrspace(3) instead.

Changes to the ARM Backend
--------------------------

Expand Down
14 changes: 0 additions & 14 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
[IntrNoMem, IntrSpeculatable]
>;

class AMDGPULDSIntrin :
Intrinsic<[llvm_any_ty],
[LLVMQualPointerType<3>,
LLVMMatchType<0>,
llvm_i32_ty, // ordering
llvm_i32_ty, // scope
llvm_i1_ty], // isVolatile
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
[llvm_i32_ty],
Expand Down Expand Up @@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
break; // No other 'amdgcn.atomic.*'
}

if (Name.starts_with("ds.fadd")) {
// Replaced with atomicrmw fadd, so there's no new declaration.
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
Name.starts_with("ds.fmax")) {
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
// declaration.
NewFn = nullptr;
return true;
}
Expand Down Expand Up @@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
AtomicRMWInst::BinOp RMWOp =
StringSwitch<AtomicRMWInst::BinOp>(Name)
.StartsWith("ds.fadd", AtomicRMWInst::FAdd)
.StartsWith("ds.fmin", AtomicRMWInst::FMin)
.StartsWith("ds.fmax", AtomicRMWInst::FMax)
.StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
.StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);

Expand Down
32 changes: 0 additions & 32 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5401,35 +5401,6 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
switch (IID) {
case Intrinsic::amdgcn_ds_fmin:
return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
return AMDGPU::G_ATOMICRMW_FMAX;
default:
llvm_unreachable("not a DS FP intrinsic");
}
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI,
Intrinsic::ID IID) const {
GISelChangeObserver &Observer = Helper.Observer;
Observer.changingInstr(MI);

MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

// The remaining operands were used to set fields in the MemOperand on
// construction.
for (int I = 6; I > 3; --I)
MI.removeOperand(I);

MI.removeOperand(1); // Remove the intrinsic ID.
Observer.changedInstr(MI);
return true;
}

// TODO: Fix pointer type handling
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstr &MI,
Expand Down Expand Up @@ -7451,9 +7422,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI, Intrinsic::ID IID) const;

bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

Expand Down
2 changes: 0 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,6 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
Expand Down
20 changes: 1 addition & 19 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,9 +501,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
case Intrinsic::amdgcn_ds_ordered_swap: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
if (!Ordering || !Volatile)
Expand Down Expand Up @@ -1018,8 +1016,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
case Intrinsic::amdgcn_flat_atomic_fadd:
Expand All @@ -1039,20 +1035,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile->isZero())
return nullptr;
Module *M = II->getParent()->getParent()->getParent();
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
Function *NewDecl =
Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
return II;
}
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
Expand Down
15 changes: 1 addition & 14 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1279,9 +1279,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,

switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
case Intrinsic::amdgcn_ds_ordered_swap: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Expand Down Expand Up @@ -1450,8 +1448,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_flat_atomic_fadd:
Expand Down Expand Up @@ -8899,15 +8895,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
: ISD::ATOMIC_LOAD_FMAX;
return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
M->getOperand(2), M->getOperand(3),
M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
Expand Down
52 changes: 52 additions & 0 deletions llvm/test/Bitcode/amdgcn-atomic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -248,4 +248,56 @@ define <2 x i16> @upgrade_amdgcn_ds_fadd_v2bf16__missing_args_as_i16(ptr addrspa
ret <2 x i16> %result0
}

declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32 immarg, i32 immarg, i1 immarg)

define float @upgrade_amdgcn_ds_fmin_f32(ptr addrspace(3) %ptr, float %val) {
; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result0 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)

; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 43, i32 3, i1 false)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") acquire, align 4
%result3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 4, i32 2, i1 false)

ret float %result3
}

define double @upgrade_amdgcn_ds_fmin_f64(ptr addrspace(3) %ptr, double %val) {
; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
%result0 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 false)

; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
%result1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 true)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
%result2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 43, i32 3, i1 false)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") acquire, align 8
%result3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 4, i32 2, i1 false)

ret double %result3
}

declare float @llvm.amdgcn.ds.fmin(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)

define float @upgrade_amdgcn_ds_fmin_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4

%result0 = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
ret float %result0
}

declare float @llvm.amdgcn.ds.fmax(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)

define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
; CHECK: = atomicrmw fmax ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result0 = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
ret float %result0
}

attributes #0 = { argmemonly nounwind willreturn }
Loading

0 comments on commit 906c8fe

Please sign in to comment.