Skip to content

Commit c1a6974

Browse files
authored
[AMDGPU][MC] Add GFX12 SMEM encoding (#75215)
1 parent f1ea77f commit c1a6974

File tree

10 files changed

+1644
-14
lines changed

10 files changed

+1644
-14
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,12 @@ def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
411411
"Has VGPR mode register indexing"
412412
>;
413413

414+
def FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads",
415+
"HasScalarDwordx3Loads",
416+
"true",
417+
"Has 96-bit scalar load instructions"
418+
>;
419+
414420
def FeatureScalarStores : SubtargetFeature<"scalar-stores",
415421
"HasScalarStores",
416422
"true",
@@ -1462,7 +1468,8 @@ def FeatureISAVersion12 : FeatureSet<
14621468
FeatureVcmpxPermlaneHazard,
14631469
FeatureSALUFloatInsts,
14641470
FeatureVGPRSingleUseHintInsts,
1465-
FeatureMADIntraFwdBug]>;
1471+
FeatureMADIntraFwdBug,
1472+
FeatureScalarDwordx3Loads]>;
14661473

14671474
//===----------------------------------------------------------------------===//
14681475

@@ -2011,6 +2018,8 @@ def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
20112018

20122019
def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
20132020

2021+
def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
2022+
20142023
// Include AMDGPU TD files
20152024
include "SISchedule.td"
20162025
include "GCNProcessors.td"

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2635,7 +2635,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
26352635
if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
26362636
// SGPR and TTMP registers must be aligned.
26372637
// Max required alignment is 4 dwords.
2638-
AlignSize = std::min(RegWidth / 32, 4u);
2638+
AlignSize = std::min(llvm::bit_ceil(RegWidth / 32), 4u);
26392639
}
26402640

26412641
if (RegNum % AlignSize != 0) {

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
213213
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
214214
DECODE_OPERAND_REG_7(SReg_64, OPW64)
215215
DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
216+
DECODE_OPERAND_REG_7(SReg_96, OPW96)
216217
DECODE_OPERAND_REG_7(SReg_128, OPW128)
217218
DECODE_OPERAND_REG_7(SReg_256, OPW256)
218219
DECODE_OPERAND_REG_7(SReg_512, OPW512)
@@ -1239,6 +1240,8 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
12391240
case AMDGPU::TTMP_64RegClassID:
12401241
shift = 1;
12411242
break;
1243+
case AMDGPU::SGPR_96RegClassID:
1244+
case AMDGPU::TTMP_96RegClassID:
12421245
case AMDGPU::SGPR_128RegClassID:
12431246
case AMDGPU::TTMP_128RegClassID:
12441247
// ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
119119
bool HasFmaMixInsts = false;
120120
bool HasMovrel = false;
121121
bool HasVGPRIndexMode = false;
122+
bool HasScalarDwordx3Loads = false;
122123
bool HasScalarStores = false;
123124
bool HasScalarAtomics = false;
124125
bool HasSDWAOmod = false;
@@ -889,6 +890,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
889890
return getGeneration() >= VOLCANIC_ISLANDS;
890891
}
891892

893+
bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
894+
892895
bool hasScalarStores() const {
893896
return HasScalarStores;
894897
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
414414
// SGPR 64-bit registers
415415
def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
416416

417-
// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
417+
// SGPR 96-bit registers.
418418
def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">;
419419

420420
// SGPR 128-bit registers

llvm/lib/Target/AMDGPU/SMInstructions.td

Lines changed: 66 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
7474
bits<7> sdst;
7575
bits<32> offset;
7676
bits<8> soffset;
77-
bits<5> cpol;
77+
bits<5> cpol;
7878
}
7979

8080
class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
@@ -300,6 +300,8 @@ multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
300300
// does sdst for SMRD on SI/CI?
301301
defm S_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
302302
defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>;
303+
let SubtargetPredicate = HasScalarDwordx3Loads in
304+
defm S_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_64, SReg_96>;
303305
defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>;
304306
defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>;
305307
defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>;
@@ -309,6 +311,8 @@ defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
309311
// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
310312
// SI/CI, but disallowed for SMEM on VI.
311313
defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>;
314+
let SubtargetPredicate = HasScalarDwordx3Loads in
315+
defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>;
312316
defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
313317
defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
314318
defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
@@ -1179,7 +1183,7 @@ def SMInfoTable : GenericTable {
11791183
class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
11801184
SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11,
11811185
SGPR_NULL_gfx11plus> {
1182-
let AssemblerPredicate = isGFX11Plus;
1186+
let AssemblerPredicate = isGFX11Only;
11831187
let DecoderNamespace = "GFX11";
11841188
let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
11851189
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
@@ -1235,28 +1239,79 @@ defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>;
12351239
// GFX12.
12361240
//===----------------------------------------------------------------------===//
12371241

1238-
class SMEM_Real_gfx12<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
1239-
SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX12,
1240-
SGPR_NULL_gfx11plus> {
1242+
class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName,
1243+
int subtarget, RegisterWithSubRegs sgpr_null> :
1244+
SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 {
1245+
1246+
let Inst{18-13} = op;
1247+
let Inst{31-26} = 0x3d;
1248+
1249+
let Inst{55-32} = !if(ps.has_offset, offset{23-0}, !if(ps.has_soffset, 0, ?));
1250+
let Inst{63-57} = !if(ps.has_soffset, soffset{6-0},
1251+
!if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?));
1252+
}
1253+
1254+
class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
1255+
SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12,
1256+
SGPR_NULL_gfx11plus> {
12411257
let AssemblerPredicate = isGFX12Plus;
12421258
let DecoderNamespace = "GFX12";
1243-
let Inst{18-13} = op{5-0};
1244-
let Inst{19} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
1245-
let Inst{24-20} = ?; // TODO-GFX12: Add new bits {24-20}: TH, Scope, NV
1246-
let Inst{25} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
1247-
let Inst{55-32} = offset{23-0};
1259+
1260+
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
1261+
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
12481262
}
12491263

1250-
class SMEM_Real_Prefetch_gfx12 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx12<op, ps> {
1264+
class SMEM_Real_Prefetch_gfx12<bits<6> op, SM_Pseudo ps> :
1265+
SMEM_Real_gfx12<op, ps> {
12511266
bits<7> sdata; // Only 5 bits of sdata are supported.
12521267

12531268
let sdst = ?;
12541269
let Inst{12-11} = 0; // Unused sdata bits.
12551270
let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?);
12561271
}
12571272

1273+
class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offsets> :
1274+
SMEM_Real_gfx12<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
1275+
RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
1276+
let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
1277+
1278+
let Inst{22-21} = cpol{4-3}; // scope
1279+
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
1280+
}
1281+
1282+
multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
1283+
defvar opName = !tolower(NAME);
1284+
def _IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, IMM_Offset>;
1285+
def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, SGPR_IMM_Offset>;
1286+
}
1287+
1288+
defm S_LOAD_B32 : SM_Real_Loads_gfx12<0x00, "S_LOAD_DWORD">;
1289+
defm S_LOAD_B64 : SM_Real_Loads_gfx12<0x01, "S_LOAD_DWORDX2">;
1290+
defm S_LOAD_B96 : SM_Real_Loads_gfx12<0x05, "S_LOAD_DWORDX3">;
1291+
defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">;
1292+
defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">;
1293+
defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">;
1294+
1295+
defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">;
1296+
defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">;
1297+
defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">;
1298+
defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">;
1299+
defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">;
1300+
defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">;
1301+
1302+
def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>;
1303+
12581304
def S_PREFETCH_INST_gfx12 : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>;
12591305
def S_PREFETCH_INST_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x25, S_PREFETCH_INST_PC_REL>;
12601306
def S_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x26, S_PREFETCH_DATA>;
12611307
def S_BUFFER_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x27, S_BUFFER_PREFETCH_DATA>;
12621308
def S_PREFETCH_DATA_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x28, S_PREFETCH_DATA_PC_REL>;
1309+
1310+
multiclass SMEM_Real_Probe_gfx12<bits<6> op> {
1311+
defvar ps = NAME;
1312+
def _IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
1313+
def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
1314+
}
1315+
1316+
defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>;
1317+
defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>;

llvm/test/MC/AMDGPU/gfx11_asm_err.s

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,9 @@ scratch_store_b128 off, v[2:5], s0 offset:8000000
158158

159159
flat_atomic_add_f32 v1, v[0:1], v2 offset:-1
160160
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: expected a 12-bit unsigned offset
161+
162+
s_load_b96 s[20:22], s[2:3], s0
163+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
164+
165+
s_buffer_load_b96 s[20:22], s[4:7], s0
166+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

0 commit comments

Comments
 (0)