Skip to content

[AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets #67461

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
"Support DPP (Data Parallel Primitives) extension in DP ALU"
>;

// Subtarget feature selected by the "dpp-src1-sgpr" feature string. When
// enabled it sets the HasDPPSrc1SGPR subtarget field to "true", indicating
// that Src1 of DPP instructions may be an SGPR.
def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
"HasDPPSrc1SGPR",
"true",
"Support SGPR for Src1 of DPP instructions"
>;

def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
"HasPackedFP32Ops",
"true",
Expand Down Expand Up @@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet<

def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts])>;
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR])>;

// Feature set built from FeatureISAVersion11_Common.Features extended with
// FeatureSALUFloatInsts, FeatureDPPSrc1SGPR and FeatureGFX11FullVGPRs.
def FeatureISAVersion11_5_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureSALUFloatInsts,
FeatureDPPSrc1SGPR,
FeatureGFX11FullVGPRs])>;

//===----------------------------------------------------------------------===//
Expand Down
35 changes: 26 additions & 9 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
if (DppCtrlIdx < 0)
return true;
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
if (DppCtrlIdx >= 0) {
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();

if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
// DP ALU DPP is supported for row_newbcast only on GFX9*
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
Error(S, "DP ALU dpp only supports row_newbcast");
return false;
}
}

if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
// DP ALU DPP is supported for row_newbcast only on GFX9*
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
Error(S, "DP ALU dpp only supports row_newbcast");
return false;
int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;

if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx >= 0) {
const MCOperand &Src1 = Inst.getOperand(Src1Idx);
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
if (Src1.isImm() ||
(Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
Error(Op.getStartLoc(), "invalid operand for instruction");
return false;
}
}
}

return true;
Expand Down
23 changes: 22 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
return &OldOpnd;
}

/// Return the size in bits of the register class that the instruction
/// descriptor of \p MI declares for operand \p Idx.
///
/// \returns 0 when the descriptor's RegClass for that operand is -1.
static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
                               MachineRegisterInfo &MRI) {
  const int16_t RCID = MI.getDesc().operands()[Idx].RegClass;
  // Only look the class up when the descriptor actually names one.
  if (RCID != -1) {
    const TargetRegisterInfo *RegInfo = MRI.getTargetRegisterInfo();
    return RegInfo->getRegSizeInBits(*RegInfo->getRegClass(RCID));
  }
  return 0;
}

MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
Expand Down Expand Up @@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
int Src0Idx = NumOperands;
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
Fail = true;
Expand All @@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
if (Src1) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
int OpNum = NumOperands;
// If subtarget does not support SGPRs for src1 operand then the
// requirements are the same as for src0. We check src0 instead because
// pseudos are shared between subtargets and allow SGPR for src1 on all.
if (!ST->hasDPPSrc1SGPR()) {
assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
getOperandSize(*DPPInst, NumOperands, *MRI) &&
"Src0 and Src1 operands should have the same size");
OpNum = Src0Idx;
}
if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
break;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasDPP = false;
bool HasDPP8 = false;
bool HasDPALU_DPP = false;
bool HasDPPSrc1SGPR = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
Expand Down Expand Up @@ -916,6 +917,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasDPALU_DPP;
}

/// \returns the value of the HasDPPSrc1SGPR subtarget flag.
bool hasDPPSrc1SGPR() const {
  return HasDPPSrc1SGPR;
}

/// \returns the value of the HasPackedFP32Ops subtarget flag.
bool hasPackedFP32Ops() const { return HasPackedFP32Ops; }
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2296,7 +2296,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
field RegisterOperand Src1VOP3DPP = VRegSrc_32;
field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureVOPD);
}

/// \returns true if \p STI has AMDGPU::FeatureDPPSrc1SGPR enabled.
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
  const bool SupportsSrc1SGPR = STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
  return SupportsSrc1SGPR;
}

/// Query whether \p STI has AMDGPU::FeatureKernargPreload enabled.
///
/// \returns the feature query result converted to unsigned (the declared
/// return type of this function).
unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
  const bool HasPreload = STI.hasFeature(AMDGPU::FeatureKernargPreload);
  return static_cast<unsigned>(HasPreload);
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
bool hasMAIInsts(const MCSubtargetInfo &STI);
bool hasVOPD(const MCSubtargetInfo &STI);
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
unsigned hasKernargPreload(const MCSubtargetInfo &STI);

Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/dpp_combine.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN

; GCN-LABEL: {{^}}dpp_add:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
Expand Down
52 changes: 49 additions & 3 deletions llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150

---

# GCN-label: name: vop3
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
name: vop3
tracksRegLiveness: true
body: |
Expand All @@ -28,10 +30,54 @@ body: |
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec

; should not be combined because src1 imm is illegal
; should not be combined on subtargets where src1 imm is illegal
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
...
---

# GCN-label: name: vop3_sgpr_src1
# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
name: vop3_sgpr_src1
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1

%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
%2:sgpr_32 = COPY $sgpr0
%3:sgpr_32 = COPY $sgpr1
%4:vgpr_32 = IMPLICIT_DEF

; should be combined because src2 allows sgpr
%5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
%6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec

; should be combined only on subtargets that allow sgpr for src1
%7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
%8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec

; should be combined only on subtargets that allow sgpr for src1
%9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec

; should be combined only on subtargets that allow inlinable constants for src1
%11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec

; should not be combined when literal constants are used
%13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
%14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
...
---

# Regression test for src_modifiers on base u16 opcode
# GCN-label: name: vop3_u16
Expand Down
19 changes: 19 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1150_asm_features.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s

//
// The subtargets tested here (gfx1150 and gfx1151) allow src1 of VOP3 DPP
// instructions to be an SGPR or an inlinable constant.
//

v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]

v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]

v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]

v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
9 changes: 9 additions & 0 deletions llvm/test/MC/AMDGPU/gfx11_asm_err.s
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction

Expand Down
13 changes: 13 additions & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_features.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1150 %s

# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff

# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff

# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05

# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05