Skip to content

[AMDGPU][MC] Add GFX12 SMEM encoding #75215

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into llvm:main from mbrkusanin:gfx12-smem
Dec 15, 2023
Merged

Conversation

mbrkusanin
Copy link
Collaborator

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Dec 12, 2023

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-mc

Author: Mirko Brkušanin (mbrkusanin)

Changes

Patch is 100.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/75215.diff

10 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+10-1)
  • (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+37-5)
  • (modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+3)
  • (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3)
  • (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+1-1)
  • (modified) llvm/lib/Target/AMDGPU/SMInstructions.td (+65-10)
  • (modified) llvm/test/MC/AMDGPU/gfx11_asm_err.s (+6)
  • (modified) llvm/test/MC/AMDGPU/gfx12_asm_smem.s (+810)
  • (modified) llvm/test/MC/AMDGPU/gfx12_err.s (+9)
  • (modified) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_smem.txt (+735)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 799e102d56174d..920cf784858768 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -411,6 +411,12 @@ def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
   "Has VGPR mode register indexing"
 >;
 
+def FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads",
+  "HasScalarDwordx3Loads",
+  "true",
+  "Has 96-bit scalar load instructions"
+>;
+
 def FeatureScalarStores : SubtargetFeature<"scalar-stores",
   "HasScalarStores",
   "true",
@@ -1462,7 +1468,8 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureVcmpxPermlaneHazard,
    FeatureSALUFloatInsts,
    FeatureVGPRSingleUseHintInsts,
-   FeatureMADIntraFwdBug]>;
+   FeatureMADIntraFwdBug,
+   FeatureScalarDwordx3Loads]>;
 
 //===----------------------------------------------------------------------===//
 
@@ -2011,6 +2018,8 @@ def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
 
 def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
 
+def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 92427335c0ad2f..050ecb93ef16d6 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1665,6 +1665,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   SMLoc getInstLoc(const OperandVector &Operands) const;
 
   bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
+  bool validateOffset(const MCInst &Inst, const OperandVector &Operands);
   bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
   bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands);
   bool validateSOPLiteral(const MCInst &Inst) const;
@@ -2630,7 +2631,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
   if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
     // SGPR and TTMP registers must be aligned.
     // Max required alignment is 4 dwords.
-    AlignSize = std::min(RegWidth / 32, 4u);
+    AlignSize = std::min(llvm::bit_ceil(RegWidth / 32), 4u);
   }
 
   if (RegNum % AlignSize != 0) {
@@ -4139,6 +4140,40 @@ SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const {
   return getLoc();
 }
 
+bool AMDGPUAsmParser::validateOffset(const MCInst &Inst,
+                                     const OperandVector &Operands) {
+  auto Opcode = Inst.getOpcode();
+  auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset);
+  if (OpNum == -1)
+    return true;
+
+  uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+  if ((TSFlags & SIInstrFlags::FLAT))
+    return validateFlatOffset(Inst, Operands);
+
+  if ((TSFlags & SIInstrFlags::SMRD))
+    return validateSMEMOffset(Inst, Operands);
+
+  const auto &Op = Inst.getOperand(OpNum);
+  if (isGFX12Plus() &&
+      (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))) {
+    const unsigned OffsetSize = 24;
+    if (!isIntN(OffsetSize, Op.getImm())) {
+      Error(getFlatOffsetLoc(Operands),
+            Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
+      return false;
+    }
+  } else {
+    const unsigned OffsetSize = 16;
+    if (!isUIntN(OffsetSize, Op.getImm())) {
+      Error(getFlatOffsetLoc(Operands),
+            Twine("expected a ") + Twine(OffsetSize) + "-bit unsigned offset");
+      return false;
+    }
+  }
+  return true;
+}
+
 bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
                                          const OperandVector &Operands) {
   uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
@@ -4796,10 +4831,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
   if (!validateMovrels(Inst, Operands)) {
     return false;
   }
-  if (!validateFlatOffset(Inst, Operands)) {
-    return false;
-  }
-  if (!validateSMEMOffset(Inst, Operands)) {
+  if (!validateOffset(Inst, Operands)) {
     return false;
   }
   if (!validateMAIAccWrite(Inst, Operands)) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ed019d26c1dfd8..4e3561aeaf93aa 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -206,6 +206,7 @@ DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
 DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
 DECODE_OPERAND_REG_7(SReg_64, OPW64)
 DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
+DECODE_OPERAND_REG_7(SReg_96, OPW96)
 DECODE_OPERAND_REG_7(SReg_128, OPW128)
 DECODE_OPERAND_REG_7(SReg_256, OPW256)
 DECODE_OPERAND_REG_7(SReg_512, OPW512)
@@ -1232,6 +1233,8 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
   case AMDGPU::TTMP_64RegClassID:
     shift = 1;
     break;
+  case AMDGPU::SGPR_96RegClassID:
+  case AMDGPU::TTMP_96RegClassID:
   case AMDGPU::SGPR_128RegClassID:
   case AMDGPU::TTMP_128RegClassID:
   // ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 94b9e49b765a6f..39cb69685712f5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -119,6 +119,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasFmaMixInsts = false;
   bool HasMovrel = false;
   bool HasVGPRIndexMode = false;
+  bool HasScalarDwordx3Loads = false;
   bool HasScalarStores = false;
   bool HasScalarAtomics = false;
   bool HasSDWAOmod = false;
@@ -884,6 +885,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
 
+  bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
+
   bool hasScalarStores() const {
     return HasScalarStores;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 7ea2280c474b05..981da13fe08952 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -414,7 +414,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
 // SGPR 64-bit registers
 def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
 
-// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
+// SGPR 96-bit registers.
 def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">;
 
 // SGPR 128-bit registers
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index c18846483cf95a..f603106b21754d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -74,7 +74,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
   bits<7>  sdst;
   bits<32> offset;
   bits<8>  soffset;
-  bits<5> cpol;
+  bits<5>  cpol;
 }
 
 class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
@@ -300,6 +300,8 @@ multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
 // does sdst for SMRD on SI/CI?
 defm S_LOAD_DWORD    : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
 defm S_LOAD_DWORDX2  : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>;
+let SubtargetPredicate = HasScalarDwordx3Loads in
+  defm S_LOAD_DWORDX3  : SM_Pseudo_Loads <SReg_64, SReg_96>;
 defm S_LOAD_DWORDX4  : SM_Pseudo_Loads <SReg_64, SReg_128>;
 defm S_LOAD_DWORDX8  : SM_Pseudo_Loads <SReg_64, SReg_256>;
 defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>;
@@ -309,6 +311,8 @@ defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
 // FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
 // SI/CI, bit disallowed for SMEM on VI.
 defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>;
+let SubtargetPredicate = HasScalarDwordx3Loads in
+  defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>;
 defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
 defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
 defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
@@ -1179,7 +1183,7 @@ def SMInfoTable : GenericTable {
 class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
     SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11,
                             SGPR_NULL_gfx11plus> {
-  let AssemblerPredicate = isGFX11Plus;
+  let AssemblerPredicate = isGFX11Only;
   let DecoderNamespace = "GFX11";
   let Inst{13}    = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
   let Inst{14}    = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
@@ -1235,19 +1239,30 @@ defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>;
 // GFX12.
 //===----------------------------------------------------------------------===//
 
-class SMEM_Real_gfx12<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
-    SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX12,
+class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName,
+                          int subtarget, RegisterWithSubRegs sgpr_null> :
+    SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 {
+
+  let Inst{18-13} = op;
+  let Inst{31-26} = 0x3d;
+
+  let Inst{55-32} = !if(ps.has_offset, offset{23-0}, !if(ps.has_soffset, 0, ?));
+  let Inst{63-57} = !if(ps.has_soffset, soffset{6-0},
+                        !if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?));
+}
+
+class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
+    SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12,
                             SGPR_NULL_gfx11plus> {
   let AssemblerPredicate = isGFX12Plus;
   let DecoderNamespace = "GFX12";
-  let Inst{18-13} = op{5-0};
-  let Inst{19}    = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
-  let Inst{24-20} = ?; // TODO-GFX12: Add new bits {24-20}: TH, Scope, NV
-  let Inst{25}    = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
-  let Inst{55-32} = offset{23-0};
+
+  let Inst{5-0}   = !if(ps.has_sbase, sbase{6-1}, ?);
+  let Inst{12-6}  = !if(ps.has_sdst, sdst{6-0}, ?);
 }
 
-class SMEM_Real_Prefetch_gfx12 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx12<op, ps> {
+class SMEM_Real_Prefetch_gfx12<bits<6> op, SM_Pseudo ps> :
+    SMEM_Real_gfx12<op, ps> {
   bits<7> sdata; // Only 5 bits of sdata are supported.
 
   let sdst = ?;
@@ -1255,8 +1270,48 @@ class SMEM_Real_Prefetch_gfx12 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx12<op,
   let Inst{10-6}  = !if(ps.has_sdst, sdata{4-0}, ?);
 }
 
+class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offsets> :
+    SMEM_Real_gfx12<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
+  RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+  let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+
+  let Inst{22-21} = cpol{4-3}; // scope
+  let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+}
+
+multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
+  defvar opName = !tolower(NAME);
+  def _IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, IMM_Offset>;
+  def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, SGPR_IMM_Offset>;
+}
+
+defm S_LOAD_B32  : SM_Real_Loads_gfx12<0x00, "S_LOAD_DWORD">;
+defm S_LOAD_B64  : SM_Real_Loads_gfx12<0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_B96  : SM_Real_Loads_gfx12<0x05, "S_LOAD_DWORDX3">;
+defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">;
+
+defm S_BUFFER_LOAD_B32  : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_B64  : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_B96  : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">;
+defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">;
+
+def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>;
+
 def S_PREFETCH_INST_gfx12        : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>;
 def S_PREFETCH_INST_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x25, S_PREFETCH_INST_PC_REL>;
 def S_PREFETCH_DATA_gfx12        : SMEM_Real_Prefetch_gfx12<0x26, S_PREFETCH_DATA>;
 def S_BUFFER_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x27, S_BUFFER_PREFETCH_DATA>;
 def S_PREFETCH_DATA_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x28, S_PREFETCH_DATA_PC_REL>;
+
+multiclass SMEM_Real_Probe_gfx12<bits<6> op> {
+  defvar ps = NAME;
+  def _IMM_gfx12      : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+  def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
+}
+
+defm S_ATC_PROBE        : SMEM_Real_Probe_gfx12<0x22>;
+defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
index 5e508b466e830b..e6b32b80e01769 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s
@@ -152,3 +152,9 @@ v_fmac_f32_e64_dpp v5, v2, 0x1234 quad_perm:[3,2,1,0]
 
 s_load_dword s1, s[2:3], s0 0x1
 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+s_load_b96 s[20:22], s[2:3], s0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
+
+s_buffer_load_b96 s[20:22], s[4:7], s0
+// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
index ed7ad5bb0c4e82..1566b9c04e3494 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_smem.s
@@ -33,3 +33,813 @@ s_buffer_prefetch_data s[20:23], 100, s10, 7
 
 s_buffer_prefetch_data s[20:23], 100, null, 7
 // GFX12: s_buffer_prefetch_data s[20:23], 0x64, null, 7 ; encoding: [0xca,0xe1,0x04,0xf4,0x64,0x00,0x00,0xf8]
+
+s_load_b32 s5, s[2:3], s0
+// GFX12: s_load_b32 s5, s[2:3], s0 offset:0x0    ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 s101, s[2:3], s0
+// GFX12: s_load_b32 s101, s[2:3], s0 offset:0x0  ; encoding: [0x41,0x19,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 vcc_lo, s[2:3], s0
+// GFX12: s_load_b32 vcc_lo, s[2:3], s0 offset:0x0 ; encoding: [0x81,0x1a,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 vcc_hi, s[2:3], s0
+// GFX12: s_load_b32 vcc_hi, s[2:3], s0 offset:0x0 ; encoding: [0xc1,0x1a,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 s5, s[4:5], s0
+// GFX12: s_load_b32 s5, s[4:5], s0 offset:0x0    ; encoding: [0x42,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 s5, s[100:101], s0
+// GFX12: s_load_b32 s5, s[100:101], s0 offset:0x0 ; encoding: [0x72,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 s5, vcc, s0
+// GFX12: s_load_b32 s5, vcc, s0 offset:0x0       ; encoding: [0x75,0x01,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b32 s5, s[2:3], s101
+// GFX12: s_load_b32 s5, s[2:3], s101 offset:0x0  ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xca]
+
+s_load_b32 s5, s[2:3], vcc_lo
+// GFX12: s_load_b32 s5, s[2:3], vcc_lo offset:0x0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xd4]
+
+s_load_b32 s5, s[2:3], vcc_hi
+// GFX12: s_load_b32 s5, s[2:3], vcc_hi offset:0x0 ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xd6]
+
+s_load_b32 s5, s[2:3], m0
+// GFX12: s_load_b32 s5, s[2:3], m0 offset:0x0    ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xfa]
+
+s_load_b32 s5, s[2:3], 0x0
+// GFX12: s_load_b32 s5, s[2:3], 0x0              ; encoding: [0x41,0x01,0x00,0xf4,0x00,0x00,0x00,0xf8]
+
+s_load_b32 s5, s[2:3], s7 offset:0x12345
+// GFX12: s_load_b32 s5, s[2:3], s7 offset:0x12345 ; encoding: [0x41,0x01,0x00,0xf4,0x45,0x23,0x01,0x0e]
+
+s_load_b64 s[10:11], s[2:3], s0
+// GFX12: s_load_b64 s[10:11], s[2:3], s0 offset:0x0 ; encoding: [0x81,0x22,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 s[12:13], s[2:3], s0
+// GFX12: s_load_b64 s[12:13], s[2:3], s0 offset:0x0 ; encoding: [0x01,0x23,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 s[100:101], s[2:3], s0
+// GFX12: s_load_b64 s[100:101], s[2:3], s0 offset:0x0 ; encoding: [0x01,0x39,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 vcc, s[2:3], s0
+// GFX12: s_load_b64 vcc, s[2:3], s0 offset:0x0   ; encoding: [0x81,0x3a,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 s[10:11], s[4:5], s0
+// GFX12: s_load_b64 s[10:11], s[4:5], s0 offset:0x0 ; encoding: [0x82,0x22,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 s[10:11], s[100:101], s0
+// GFX12: s_load_b64 s[10:11], s[100:101], s0 offset:0x0 ; encoding: [0xb2,0x22,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 s[10:11], vcc, s0
+// GFX12: s_load_b64 s[10:11], vcc, s0 offset:0x0 ; encoding: [0xb5,0x22,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b64 s[10:11], s[2:3], s101
+// GFX12: s_load_b64 s[10:11], s[2:3], s101 offset:0x0 ; encoding: [0x81,0x22,0x00,0xf4,0x00,0x00,0x00,0xca]
+
+s_load_b64 s[10:11], s[2:3], vcc_lo
+// GFX12: s_load_b64 s[10:11], s[2:3], vcc_lo offset:0x0 ; encoding: [0x81,0x22,0x00,0xf4,0x00,0x00,0x00,0xd4]
+
+s_load_b64 s[10:11], s[2:3], vcc_hi
+// GFX12: s_load_b64 s[10:11], s[2:3], vcc_hi offset:0x0 ; encoding: [0x81,0x22,0x00,0xf4,0x00,0x00,0x00,0xd6]
+
+s_load_b64 s[10:11], s[2:3], m0
+// GFX12: s_load_b64 s[10:11], s[2:3], m0 offset:0x0 ; encoding: [0x81,0x22,0x00,0xf4,0x00,0x00,0x00,0xfa]
+
+s_load_b64 s[10:11], s[2:3], 0x0
+// GFX12: s_load_b64 s[10:11], s[2:3], 0x0        ; encoding: [0x81,0x22,0x00,0xf4,0x00,0x00,0x00,0xf8]
+
+s_load_b96 s[20:22], s[2:3], s0
+// GFX12: s_load_b96 s[20:22], s[2:3], s0 offset:0x0 ; encoding: [0x01,0xa5,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 s[24:26], s[2:3], s0
+// GFX12: s_load_b96 s[24:26], s[2:3], s0 offset:0x0 ; encoding: [0x01,0xa6,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 s[96:98], s[2:3], s0
+// GFX12: s_load_b96 s[96:98], s[2:3], s0 offset:0x0 ; encoding: [0x01,0xb8,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 s[20:22], s[4:5], s0
+// GFX12: s_load_b96 s[20:22], s[4:5], s0 offset:0x0 ; encoding: [0x02,0xa5,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 s[20:22], s[100:101], s0
+// GFX12: s_load_b96 s[20:22], s[100:101], s0 offset:0x0 ; encoding: [0x32,0xa5,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 s[20:22], vcc, s0
+// GFX12: s_load_b96 s[20:22], vcc, s0 offset:0x0 ; encoding: [0x35,0xa5,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b96 s[20:22], s[2:3], s101
+// GFX12: s_load_b96 s[20:22], s[2:3], s101 offset:0x0 ; encoding: [0x01,0xa5,0x00,0xf4,0x00,0x00,0x00,0xca]
+
+s_load_b96 s[20:22], s[2:3], vcc_lo
+// GFX12: s_load_b96 s[20:22], s[2:3], vcc_lo offset:0x0 ; encoding: [0x01,0xa5,0x00,0xf4,0x00,0x00,0x00,0xd4]
+
+s_load_b96 s[20:22], s[2:3], vcc_hi
+// GFX12: s_load_b96 s[20:22], s[2:3], vcc_hi offset:0x0 ; encoding: [0x01,0xa5,0x00,0xf4,0x00,0x00,0x00,0xd6]
+
+s_load_b96 s[20:22], s[2:3], m0
+// GFX12: s_load_b96 s[20:22], s[2:3], m0 offset:0x0 ; encoding: [0x01,0xa5,0x00,0xf4,0x00,0x00,0x00,0xfa]
+
+s_load_b96 s[20:22], s[2:3], 0x0
+// GFX12: s_load_b96 s[20:22], s[2:3], 0x0       ; encoding: [0x01,0xa5,0x00,0xf4,0x00,0x00,0x00,0xf8]
+
+s_load_b128 s[20:23], s[2:3], s0
+// GFX12: s_load_b128 s[20:23], s[2:3], s0 offset:0x0 ; encoding: [0x01,0x45,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 s[24:27], s[2:3], s0
+// GFX12: s_load_b128 s[24:27], s[2:3], s0 offset:0x0 ; encoding: [0x01,0x46,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 s[96:99], s[2:3], s0
+// GFX12: s_load_b128 s[96:99], s[2:3], s0 offset:0x0 ; encoding: [0x01,0x58,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 s[20:23], s[4:5], s0
+// GFX12: s_load_b128 s[20:23], s[4:5], s0 offset:0x0 ; encoding: [0x02,0x45,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 s[20:23], s[100:101], s0
+// GFX12: s_load_b128 s[20:23], s[100:101], s0 offset:0x0 ; encoding: [0x32,0x45,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 s[20:23], vcc, s0
+// GFX12: s_load_b128 s[20:23], vcc, s0 offset:0x0 ; encoding: [0x35,0x45,0x00,0xf4,0x00,0x00,0x00,0x00]
+
+s_load_b128 s[20:23], s[2:3], s101
+// GFX12: s_load_b128 s[20:23], s[2:3], s101 offset:0x0 ; encoding: [0x01,0x45,0x00,0xf4,0x00,0x00,0x00,0xca]
+
+s_load_b128 s[20:23], s[2:3], vcc_lo
+// GF...
[truncated]

Copy link
Collaborator

@piotrAMD piotrAMD left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@mbrkusanin mbrkusanin merged commit c1a6974 into llvm:main Dec 15, 2023
@mbrkusanin mbrkusanin deleted the gfx12-smem branch January 19, 2024 09:32
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants