Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMDGPU: Remove ds_fmin/ds_fmax intrinsics #96739

Merged
merged 1 commit into from
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,11 @@ Changes to the AMDGPU Backend

* Implemented :ref:`llvm.get.rounding <int_get_rounding>` and :ref:`llvm.set.rounding <int_set_rounding>`

* Removed ``llvm.amdgcn.ds.fadd``, ``llvm.amdgcn.ds.fmin`` and
``llvm.amdgcn.ds.fmax`` intrinsics. Users should use the
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
`fmax` with addrspace(3) instead.

Changes to the ARM Backend
--------------------------

Expand Down
14 changes: 0 additions & 14 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -523,17 +523,6 @@ def int_amdgcn_fmad_ftz :
[IntrNoMem, IntrSpeculatable]
>;

class AMDGPULDSIntrin :
Intrinsic<[llvm_any_ty],
[LLVMQualPointerType<3>,
LLVMMatchType<0>,
llvm_i32_ty, // ordering
llvm_i32_ty, // scope
llvm_i1_ty], // isVolatile
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
[llvm_i32_ty],
Expand Down Expand Up @@ -571,9 +560,6 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics
Expand Down
8 changes: 6 additions & 2 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1033,8 +1033,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
break; // No other 'amdgcn.atomic.*'
}

if (Name.starts_with("ds.fadd")) {
// Replaced with atomicrmw fadd, so there's no new declaration.
if (Name.starts_with("ds.fadd") || Name.starts_with("ds.fmin") ||
Name.starts_with("ds.fmax")) {
// Replaced with atomicrmw fadd/fmin/fmax, so there's no new
// declaration.
NewFn = nullptr;
return true;
}
Expand Down Expand Up @@ -2347,6 +2349,8 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
AtomicRMWInst::BinOp RMWOp =
StringSwitch<AtomicRMWInst::BinOp>(Name)
.StartsWith("ds.fadd", AtomicRMWInst::FAdd)
.StartsWith("ds.fmin", AtomicRMWInst::FMin)
.StartsWith("ds.fmax", AtomicRMWInst::FMax)
.StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
.StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap);

Expand Down
32 changes: 0 additions & 32 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5401,35 +5401,6 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
switch (IID) {
case Intrinsic::amdgcn_ds_fmin:
return AMDGPU::G_ATOMICRMW_FMIN;
case Intrinsic::amdgcn_ds_fmax:
return AMDGPU::G_ATOMICRMW_FMAX;
default:
llvm_unreachable("not a DS FP intrinsic");
}
}

bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI,
Intrinsic::ID IID) const {
GISelChangeObserver &Observer = Helper.Observer;
Observer.changingInstr(MI);

MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

// The remaining operands were used to set fields in the MemOperand on
// construction.
for (int I = 6; I > 3; --I)
MI.removeOperand(I);

MI.removeOperand(1); // Remove the intrinsic ID.
Observer.changedInstr(MI);
return true;
}

// TODO: Fix pointer type handling
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstr &MI,
Expand Down Expand Up @@ -7451,9 +7422,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::amdgcn_rsq_clamp:
return legalizeRsqClampIntrinsic(MI, MRI, B);
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI, Intrinsic::ID IID) const;

bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

Expand Down
2 changes: 0 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,6 @@ def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
Expand Down
20 changes: 1 addition & 19 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -501,9 +501,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
case Intrinsic::amdgcn_ds_ordered_swap: {
auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
if (!Ordering || !Volatile)
Expand Down Expand Up @@ -1018,8 +1016,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
case Intrinsic::amdgcn_flat_atomic_fadd:
Expand All @@ -1039,20 +1035,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile->isZero())
return nullptr;
Module *M = II->getParent()->getParent()->getParent();
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
Function *NewDecl =
Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
return II;
}
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
Expand Down
15 changes: 1 addition & 14 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1279,9 +1279,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,

switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
case Intrinsic::amdgcn_ds_ordered_swap: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
Expand Down Expand Up @@ -1450,8 +1448,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fmax:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_flat_atomic_fadd:
Expand Down Expand Up @@ -8899,15 +8895,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
unsigned Opc = IntrID == Intrinsic::amdgcn_ds_fmin ? ISD::ATOMIC_LOAD_FMIN
: ISD::ATOMIC_LOAD_FMAX;
return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), M->getOperand(0),
M->getOperand(2), M->getOperand(3),
M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
Expand Down
52 changes: 52 additions & 0 deletions llvm/test/Bitcode/amdgcn-atomic.ll
Original file line number Diff line number Diff line change
Expand Up @@ -248,4 +248,56 @@ define <2 x i16> @upgrade_amdgcn_ds_fadd_v2bf16__missing_args_as_i16(ptr addrspa
ret <2 x i16> %result0
}

declare float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)
declare double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) nocapture, double, i32 immarg, i32 immarg, i1 immarg)

define float @upgrade_amdgcn_ds_fmin_f32(ptr addrspace(3) %ptr, float %val) {
; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result0 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)

; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result1 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 true)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result2 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 43, i32 3, i1 false)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") acquire, align 4
%result3 = call float @llvm.amdgcn.ds.fmin.f32(ptr addrspace(3) %ptr, float %val, i32 4, i32 2, i1 false)

ret float %result3
}

define double @upgrade_amdgcn_ds_fmin_f64(ptr addrspace(3) %ptr, double %val) {
; CHECK: atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
%result0 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 false)

; CHECK: = atomicrmw volatile fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
%result1 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 0, i32 0, i1 true)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") seq_cst, align 8
%result2 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 43, i32 3, i1 false)

; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, double %val syncscope("agent") acquire, align 8
%result3 = call double @llvm.amdgcn.ds.fmin.f64(ptr addrspace(3) %ptr, double %val, i32 4, i32 2, i1 false)

ret double %result3
}

declare float @llvm.amdgcn.ds.fmin(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)

define float @upgrade_amdgcn_ds_fmin_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
; CHECK: = atomicrmw fmin ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4

%result0 = call float @llvm.amdgcn.ds.fmin(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
ret float %result0
}

declare float @llvm.amdgcn.ds.fmax(ptr addrspace(3) nocapture, float, i32 immarg, i32 immarg, i1 immarg)

define float @upgrade_amdgcn_ds_fmax_f32_no_suffix(ptr addrspace(3) %ptr, float %val) {
; CHECK: = atomicrmw fmax ptr addrspace(3) %ptr, float %val syncscope("agent") seq_cst, align 4
%result0 = call float @llvm.amdgcn.ds.fmax(ptr addrspace(3) %ptr, float %val, i32 0, i32 0, i1 false)
ret float %result0
}

attributes #0 = { argmemonly nounwind willreturn }
Loading
Loading