Skip to content

Commit 2973feb

Browse files
authored
[AMDGPU] Force the third source operand of the MAI instructions to VGPR if no AGPRs are used. (#69720)
eaf85b9c28 "[AMDGPU] Select VGPR versions of MFMA if possible" prevents the compiler from reserving AGPRs if a kernel has no inline asm explicitly using AGPRs, has no calls, and runs at least 2 waves with no more than 256 VGPRs. This, in turn, makes it impossible to allocate an AGPR when one is needed. As a result, register allocation fails if an MAI instruction has at least one AGPR operand. This change checks whether AGPRs are available and, if they are not, forces the operands to VGPRs. --------- Co-authored-by: Alexander Timofeev <alexander.timofeev@amd.com>
1 parent 25da9bb commit 2973feb

File tree

2 files changed

+66
-3
lines changed

2 files changed

+66
-3
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14136,7 +14136,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
1413614136
SDNode *Node) const {
1413714137
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
1413814138

14139-
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
14139+
MachineFunction *MF = MI.getParent()->getParent();
14140+
MachineRegisterInfo &MRI = MF->getRegInfo();
14141+
SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1414014142

1414114143
if (TII->isVOP3(MI.getOpcode())) {
1414214144
// Make sure constant bus requirements are respected.
@@ -14147,11 +14149,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
1414714149
// use between vgpr and agpr as agpr tuples tend to be big.
1414814150
if (!MI.getDesc().operands().empty()) {
1414914151
unsigned Opc = MI.getOpcode();
14152+
bool HasAGPRs = Info->mayNeedAGPRs();
1415014153
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
14151-
for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
14152-
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
14154+
int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
14155+
for (auto I :
14156+
{AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
14157+
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
1415314158
if (I == -1)
1415414159
break;
14160+
if ((I == Src2Idx) && (HasAGPRs))
14161+
break;
1415514162
MachineOperand &Op = MI.getOperand(I);
1415614163
if (!Op.isReg() || !Op.getReg().isVirtual())
1415714164
continue;
@@ -14169,6 +14176,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
1416914176
MRI.setRegClass(Op.getReg(), NewRC);
1417014177
}
1417114178

14179+
if (!HasAGPRs)
14180+
return;
14181+
1417214182
// Resolve the rest of AV operands to AGPRs.
1417314183
if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
1417414184
if (Src2->isReg() && Src2->getReg().isVirtual()) {
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX940 %s
3+
4+
5+
define protected amdgpu_kernel void @test(ptr addrspace(1) %in, ptr addrspace(1) %out) {
6+
; GFX940-LABEL: test:
7+
; GFX940: ; %bb.0: ; %entry
8+
; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
9+
; GFX940-NEXT: v_mov_b32_e32 v0, 0
10+
; GFX940-NEXT: v_mov_b32_e32 v2, v0
11+
; GFX940-NEXT: v_mov_b32_e32 v3, v0
12+
; GFX940-NEXT: v_mov_b32_e32 v1, v0
13+
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
14+
; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
15+
; GFX940-NEXT: v_mov_b64_e32 v[10:11], v[2:3]
16+
; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[0:1]
17+
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
18+
; GFX940-NEXT: v_mov_b32_e32 v12, s4
19+
; GFX940-NEXT: v_mov_b32_e32 v13, s5
20+
; GFX940-NEXT: v_mov_b32_e32 v4, s6
21+
; GFX940-NEXT: v_mov_b32_e32 v5, s7
22+
; GFX940-NEXT: v_mov_b32_e32 v6, s7
23+
; GFX940-NEXT: v_mov_b32_e32 v7, s7
24+
; GFX940-NEXT: s_nop 1
25+
; GFX940-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[12:13], v[4:7], v13
26+
; GFX940-NEXT: s_nop 6
27+
; GFX940-NEXT: global_store_dword v0, v11, s[2:3] offset:12 sc0 sc1
28+
; GFX940-NEXT: s_endpgm
29+
entry:
30+
%arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 0
31+
%arrayidx1 = getelementptr inbounds i32, ptr addrspace(1) %in, i64 1
32+
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i64 2
33+
%arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %in, i64 3
34+
%0 = load i32, ptr addrspace(1) %arrayidx
35+
%1 = load i32, ptr addrspace(1) %arrayidx1
36+
%2 = load i32, ptr addrspace(1) %arrayidx2
37+
%3 = load i32, ptr addrspace(1) %arrayidx3
38+
%src1.0 = insertelement <2 x i32> undef, i32 %0, i64 0
39+
%src1 = insertelement <2 x i32> %src1.0, i32 %1, i64 1
40+
%src2.0 = insertelement <4 x i32> undef, i32 %2, i64 0
41+
%src2.1 = insertelement <4 x i32> %src2.0, i32 %3, i64 1
42+
%src2.2 = insertelement <4 x i32> %src2.1, i32 %3, i64 2
43+
%src2 = insertelement <4 x i32> %src2.2, i32 %3, i64 3
44+
%4 = tail call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %src1, <4 x i32> %src2, <4 x i32> zeroinitializer, i32 %1, i32 0, i32 0)
45+
%vecext = extractelement <4 x i32> %4, i64 0
46+
%vecext.1 = extractelement <4 x i32> %4, i64 1
47+
%vecext.2 = extractelement <4 x i32> %4, i64 2
48+
%vecext.3 = extractelement <4 x i32> %4, i64 3
49+
%arrayidx4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 3
50+
store i32 %vecext.3, ptr addrspace(1) %arrayidx4
51+
ret void
52+
}
53+
declare <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32>, <4 x i32>, <4 x i32>, i32, i32 immarg, i32 immarg)

0 commit comments

Comments
 (0)