-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets #67461
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mc ChangesIn order to avoid duplicating every dpp pseudo opcode that has src1, we allow it for all opcodes and add manual checks on subtargets that do not support it. Full diff: https://github.com/llvm/llvm-project/pull/67461.diff 11 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d5356d1be3d758a..924144c5366f37e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
"Support DPP (Data Parallel Primitives) extension in DP ALU"
>;
+def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
+ "HasDPPSrc1SGPR",
+ "true",
+ "Support SGPR for Src1 of DPP instructions"
+>;
+
def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
"HasPackedFP32Ops",
"true",
@@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet<
def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [FeatureSALUFloatInsts])>;
+ [FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR])>;
def FeatureISAVersion11_5_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR,
FeatureGFX11FullVGPRs])>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 5032dd665b07e81..35656bcaea1af7f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
- if (DppCtrlIdx < 0)
- return true;
- unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+ if (DppCtrlIdx >= 0) {
+ unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+
+ if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
+ AMDGPU::isDPALU_DPP(MII.get(Opc))) {
+ // DP ALU DPP is supported for row_newbcast only on GFX9*
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
+ Error(S, "DP ALU dpp only supports row_newbcast");
+ return false;
+ }
+ }
- if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
- AMDGPU::isDPALU_DPP(MII.get(Opc))) {
- // DP ALU DPP is supported for row_newbcast only on GFX9*
- SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
- Error(S, "DP ALU dpp only supports row_newbcast");
- return false;
+ int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
+ bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;
+
+ if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx >= 0) {
+ const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (Src1.isImm() ||
+ (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
+ Error(Op.getStartLoc(), "invalid operand for instruction");
+ return false;
+ }
+ }
}
return true;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e1cd0f0a732b93e..3e516a6e40b372a 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -278,6 +278,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
+ int Src0Idx = NumOperands;
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
Fail = true;
@@ -301,7 +302,14 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
if (Src1) {
- if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+ int OpNum = NumOperands;
+ // If subtarget does not support SGPRs for src1 operand then the
+ // requirements are the same as for src0. We check src0 instead because
+ // pseudos are shared between subtargets and allow SGPR for src1 on all.
+ if (!ST->hasDPPSrc1SGPR())
+ OpNum = Src0Idx;
+
+ if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
break;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index f60938d830fcc43..c73cf6b2bd815ff 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasDPP = false;
bool HasDPP8 = false;
bool HasDPALU_DPP = false;
+ bool HasDPPSrc1SGPR = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
@@ -916,6 +917,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasDPALU_DPP;
}
+ bool hasDPPSrc1SGPR() const {
+ return HasDPPSrc1SGPR;
+ }
+
bool hasPackedFP32Ops() const {
return HasPackedFP32Ops;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index ea6df380153bb86..60a6964c754ff64 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2296,7 +2296,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
- field RegisterOperand Src1VOP3DPP = VRegSrc_32;
+ field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index da664c93d188963..6d0ad763d9e6cc1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureVOPD);
}
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
+ return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
+}
+
unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureKernargPreload);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d5092333c217127..297a69f54d63721 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
bool hasMAIInsts(const MCSubtargetInfo &STI);
bool hasVOPD(const MCSubtargetInfo &STI);
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
index 4d979e2cf74962c..a72bcbb529e7d9f 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 91e97a36a597a08..59649263f19a577 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -1,4 +1,5 @@
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
---
@@ -6,7 +7,8 @@
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
-# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
+# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
name: vop3
tracksRegLiveness: true
body: |
@@ -28,11 +30,53 @@ body: |
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
- ; should not be combined because src1 imm is illegal
+ ; should not be combined on subtargets where src1 imm is illegal
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
...
+# GCN-label: name: vop3_sgpr_src1
+# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
+# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
+# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
+name: vop3_sgpr_src1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1
+
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:sgpr_32 = COPY $sgpr0
+ %3:sgpr_32 = COPY $sgpr1
+ %4:vgpr_32 = IMPLICIT_DEF
+
+ ; should be combined because src2 allows sgpr
+ %5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+ ; should be combined only on subtargets that allow sgpr for src1
+ %7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
+
+ ; should be combined only on subtargets that allow sgpr for src1
+ %9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
+
+ ; should be combined only on subtargets that allow inlinable constants for src1
+ %11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
+
+ ; should not be combined when literal constants are used
+ %13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
+ %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
+...
+
# Regression test for src_modifiers on base u16 opcode
# GCN-label: name: vop3_u16
# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
new file mode 100644
index 000000000000000..ab7dda8485570d9
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s
@@ -0,0 +1,20 @@
+
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s
+// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s
+
+//
+// Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable
+// constant.
+//
+
+v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
+
+v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
+
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index e475a1d5190777e..6a3a0ca40bfb9ba 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -45,6 +45,9 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a disassembler test. It might be permissible to accept sgpr as src1 on gfx11 as well as gfx1150 (i.e. disassembler is not strict), but we should at least ensure it is acceptable on gfx1150.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM with couple nits.
…rgets In order to avoid duplicating every dpp pseudo opcode that has src1, we allow it for all opcodes and add manual checks on subtargets that do not support it.
bec805c
to
8af39e0
Compare
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Local branch amd-gfx 80abf82 Merged main:b4afade17564 into amd-gfx:259556a2c143 Remote branch main 2cd2445 [AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets (llvm#67461)
@mbrkusanin @Sisyph it seems like this works for integer instructions:
but not for floating point:
Is that a bug or is there some reason for it? |
A bug. Looks like class for src1_modifiers needs to be updated in tablegen, it differs for ints and floats. I'm working on a patch. |
…ng VOPC. Fixes support on GFX1150 and GFX12 where src1 of e64_dpp instructions should allow sgpr and imm operands. PR llvm#67461 added support for this with int operands, but it was missing a piece for float. Changing VOPC e64_dpp will be in a different patch because there is a bug preventing that change.
#87382) …ng VOPC. Fixes support on GFX1150 and GFX12 where src1 of e64_dpp instructions should allow sgpr and imm operands. PR #67461 added support for this with int operands, but it was missing a piece for float. Changing VOPC e64_dpp will be in a different patch because there is a bug preventing that change.
In order to avoid duplicating every dpp pseudo opcode that has src1, we allow it for all opcodes and add manual checks on subtargets that do not support it.