-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Verify f8f6f4 formats in assembler #117826
Merged
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/verify-f8f6f4-format-assembler
Nov 27, 2024
Merged
AMDGPU: Verify f8f6f4 formats in assembler #117826
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/verify-f8f6f4-format-assembler
Nov 27, 2024
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Nov 27, 2024
Merged
This was referenced Nov 27, 2024
arsenm
requested review from
jayfoad,
pravinjagtap,
rampitec,
shiltian,
Sisyph and
srpande
November 27, 2024 00:55
arsenm
force-pushed
the
users/arsenm/gfx950/clang-size-check-global-load-lds
branch
from
November 27, 2024 01:04
bebc41c
to
ff1c112
Compare
arsenm
force-pushed
the
users/arsenm/gfx950/verify-f8f6f4-format-assembler
branch
from
November 27, 2024 01:04
6a5a2df
to
e9df017
Compare
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesVerify the register widths of the corresponding operands match Patch is 22.78 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117826.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 3ff588c4d0a452..37a8c7fef7d0af 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -4196,6 +4196,38 @@ bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst,
if ((Desc.TSFlags & SIInstrFlags::IsMAI) == 0)
return true;
+ int BlgpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::blgp);
+ if (BlgpIdx != -1) {
+ if (const MFMA_F8F6F4_Info *Info = AMDGPU::isMFMA_F8F6F4(Opc)) {
+ int CbszIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::cbsz);
+
+ unsigned CBSZ = Inst.getOperand(CbszIdx).getImm();
+ unsigned BLGP = Inst.getOperand(BlgpIdx).getImm();
+
+ // Validate the correct register size was used for the floating point
+ // format operands
+
+ bool Success = true;
+ if (Info->NumRegsSrcA != mfmaScaleF8F6F4FormatToNumRegs(CBSZ)) {
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ Error(getRegLoc(mc2PseudoReg(Inst.getOperand(Src0Idx).getReg()),
+ Operands),
+ "wrong register tuple size for cbsz value " + Twine(CBSZ));
+ Success = false;
+ }
+
+ if (Info->NumRegsSrcB != mfmaScaleF8F6F4FormatToNumRegs(BLGP)) {
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ Error(getRegLoc(mc2PseudoReg(Inst.getOperand(Src1Idx).getReg()),
+ Operands),
+ "wrong register tuple size for blgp value " + Twine(BLGP));
+ Success = false;
+ }
+
+ return Success;
+ }
+ }
+
const int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx == -1)
return true;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index b27e2529f4a807..cddb87655ee8c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -3230,6 +3230,15 @@ def getMFMA_F8F6F4_WithSize : GenericTable {
let PrimaryKeyName = "getMFMA_F8F6F4_InstWithNumRegs" ;
}
+def isMFMA_F8F6F4Table : GenericTable {
+ let FilterClass = "MFMA_F8F6F4_WithSizeTable";
+ let CppTypeName = "MFMA_F8F6F4_Info";
+// let Fields = [ "Opcode" ];
+ let Fields = [ "Opcode", "F8F8Opcode", "NumRegsSrcA", "NumRegsSrcB" ];
+ let PrimaryKey = [ "Opcode" ];
+ let PrimaryKeyName = "isMFMA_F8F6F4" ;
+}
+
def FP8DstByteSelTable : GenericTable {
let FilterClass = "VOP3_Pseudo";
let CppTypeName = "FP8DstByteSelInfo";
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b233e898589396..ab5f0694c07f95 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -417,6 +417,7 @@ struct FP8DstByteSelInfo {
#define GET_WMMAOpcode3AddrMappingTable_IMPL
#define GET_getMFMA_F8F6F4_WithSize_DECL
#define GET_getMFMA_F8F6F4_WithSize_IMPL
+#define GET_isMFMA_F8F6F4Table_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -525,7 +526,7 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info ? Info->is_gfx940_xdl : false;
}
-static uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) {
+uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) {
switch (EncodingVal) {
case MFMAScaleFormats::FP6_E2M3:
case MFMAScaleFormats::FP6_E3M2:
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b0581711961b13..9f7fbec6a542f7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -96,6 +96,13 @@ struct MAIInstInfo {
bool is_gfx940_xdl;
};
+struct MFMA_F8F6F4_Info {
+ unsigned Opcode;
+ unsigned F8F8Opcode;
+ uint8_t NumRegsSrcA;
+ uint8_t NumRegsSrcB;
+};
+
#define GET_MIMGBaseOpcode_DECL
#define GET_MIMGDim_DECL
#define GET_MIMGEncoding_DECL
@@ -103,6 +110,8 @@ struct MAIInstInfo {
#define GET_MIMGMIPMapping_DECL
#define GET_MIMGBiASMapping_DECL
#define GET_MAIInstInfoTable_DECL
+#define GET_MAIInstInfoTable_DECL
+#define GET_isMFMA_F8F6F4Table_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -581,12 +590,8 @@ unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST);
LLVM_READONLY
CanBeVOPD getCanBeVOPD(unsigned Opc);
-struct MFMA_F8F6F4_Info {
- unsigned Opcode;
- unsigned F8F8Opcode;
- uint8_t NumRegsSrcA;
- uint8_t NumRegsSrcB;
-};
+LLVM_READNONE
+uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal);
LLVM_READONLY
const MFMA_F8F6F4_Info *getMFMA_F8F6F4_WithFormatArgs(unsigned CBSZ,
diff --git a/llvm/test/MC/AMDGPU/mai-gfx950-err.s b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
index a6dff076392c85..e700b0b3cabfe4 100644
--- a/llvm/test/MC/AMDGPU/mai-gfx950-err.s
+++ b/llvm/test/MC/AMDGPU/mai-gfx950-err.s
@@ -29,3 +29,130 @@ v_mfma_ld_scale_b32 v0, v0 neg_hi:[0,1]
v_mfma_ld_scale_b32 v0, v0 neg_lo:[0,1] neg_hi:[0,1]
// CHECK: :[[@LINE-1]]:28: error: not a valid operand
+
+
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] cbsz:2
+// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2
+
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] blgp:2
+// CHECK: :[[@LINE-1]]:46: error: wrong register tuple size for blgp value 2
+
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] cbsz:2 blgp:2
+// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2
+// CHECK: :[[@LINE-2]]:46: error: wrong register tuple size for blgp value 2
+
+
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] cbsz:2
+// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2
+
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] blgp:2
+// CHECK: :[[@LINE-1]]:46: error: wrong register tuple size for blgp value 2
+
+v_mfma_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] cbsz:2 blgp:2
+// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2
+// CHECK: :[[@LINE-2]]:46: error: wrong register tuple size for blgp value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 cbsz:2
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] v20, v21 cbsz:2
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 blgp:2
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:19], a[0:3] v20, v21 blgp:2
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2
+
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15] cbsz:2
+// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2
+
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15] blgp:2
+// CHECK: :[[@LINE-1]]:47: error: wrong register tuple size for blgp value 2
+
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[0:15] cbsz:2
+// CHECK: :[[@LINE-1]]:37: error: wrong register tuple size for cbsz value 2
+
+v_mfma_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[0:15] blgp:2
+// CHECK: :[[@LINE-1]]:47: error: wrong register tuple size for blgp value 2
+
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15] v32, v33 cbsz:2
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2
+
+v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], a[16:23], a[24:31], a[0:15] v32, v33 cbsz:2
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2
+
+
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 0
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:1
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 1
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:2
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:3
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3
+
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 0
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 blgp:1
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 1
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 blgp:2
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:15], v[0:3] v20, v21 blgp:3
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 0
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 cbsz:1
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 1
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 cbsz:2
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:7], a[12:19], a[0:3] v20, v21 cbsz:3
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 0
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 blgp:1
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 1
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 blgp:2
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 2
+
+v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], a[4:11], a[12:15], a[0:3] v20, v21 blgp:3
+// CHECK: :[[@LINE-1]]:52: error: wrong register tuple size for blgp value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[12:19], v[0:3] v20, v21 cbsz:3
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 cbsz:3
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:7], v[0:3] v20, v21 blgp:3
+// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:3
+// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 3
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[12:19], v[0:3] v20, v21 cbsz:4
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 4
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[0:3] v20, v21 cbsz:4
+// CHECK: :[[@LINE-1]]:43: error: wrong register tuple size for cbsz value 4
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:9], v[0:3] v20, v21 blgp:4
+// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4
+
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[12:19], v[4:11], v[0:3] v20, v21 blgp:4
+// CHECK: :[[@LINE-1]]:53: error: wrong register tuple size for blgp value 4
diff --git a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
index 03b4041f77bcef..33665b655086c1 100644
--- a/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
+++ b/llvm/test/tools/llvm-mca/AMDGPU/gfx950.s
@@ -1,9 +1,9 @@
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx950 --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
# CHECK: Iterations: 1
-# CHECK: Instructions: 129
-# CHECK: Total Cycles: 1069
-# CHECK: Total uOps: 129
+# CHECK: Instructions: 133
+# CHECK: Total Cycles: 1101
+# CHECK: Total uOps: 133
v_mfma_f32_16x16x32_f16 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[0:3], a[4:7]
@@ -24,46 +24,37 @@ v_mfma_ld_scale_b32 v0, v0
;; TODO: test vdc/adc
v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3]
v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:2
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:3
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:4
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:2
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:3
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:7], v[0:3] blgp:4
v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:3
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:4
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 blgp:1
-v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 blgp:2
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[4:11], v[0:3] cbsz:4
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2 blgp:1
+v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] cbsz:1 blgp:2
;; FIXME: should have different cycle count depending on whether either matrix is f8
v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15]
v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:2
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:3
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:4
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:2
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:3
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:7], v[0:15] blgp:4
v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:1
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:3
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:4
-v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:2
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:2
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:3
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:7], v[4:11], v[0:15] cbsz:4
+v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15] cbsz:2
v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1
;; FIXME: should have different cycle count depending on whether either matrix is f8
v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5
-
-;; FIXME
-;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:1
-;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:2
-;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:3
-;; v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:4
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3], v5, v5 blgp:1
+v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:9], v[0:3], v5, v5 cbsz:2 blgp:2
v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5
-
-;; FIXME
-;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:1
-;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:2
-;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:3
-;; v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15], v5, v5 blgp:4
-
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:11], v[0:15], v5, v5 cbsz:2 blgp:1
+v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[4:9], v[4:9], v[0:15], v5, v5 cbsz:2 blgp:2
;; TODO: These results are wrong
v_smfmac_f32_16x16x64_f16 v[10:13], a[2:5], v[4:11], v3 cbsz:3 abid:1
@@ -213,28 +204,32 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
# CHECK-NEXT: - - - - 1.00 - - v_mfma_ld_scale_b32 v0, v0
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3]
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:1
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:2
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:3
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] blgp:4
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:2
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] blgp:3
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:7], v[0:3] blgp:4
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:3
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:4
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:2 blgp:1
-# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:11], v[0:3] cbsz:1 blgp:2
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:3
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:7], v[4:11], v[0:3] cbsz:4
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:9], v[4:11], v[0:3] cbsz:2 blgp:1
+# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[4:9], v[0:3] cbsz:1 blgp:2
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15]
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:1
-# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:2
-# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:3
-# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] blgp:4
+# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:2
+# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:9], v[0:15] blgp:3
+# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:7], v[0:15] blgp:4
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[4:11], v[4:11], v[0:15] cbsz:1
-# CHECK-NEXT: - ...
[truncated]
|
shiltian
approved these changes
Nov 27, 2024
arsenm
force-pushed
the
users/arsenm/gfx950/clang-size-check-global-load-lds
branch
from
November 27, 2024 04:18
ff1c112
to
185267f
Compare
arsenm
force-pushed
the
users/arsenm/gfx950/verify-f8f6f4-format-assembler
branch
from
November 27, 2024 04:18
e9df017
to
1d53ad5
Compare
This was referenced Nov 27, 2024
arsenm
force-pushed
the
users/arsenm/gfx950/clang-size-check-global-load-lds
branch
from
November 27, 2024 04:37
185267f
to
2083a51
Compare
Base automatically changed from
users/arsenm/gfx950/clang-size-check-global-load-lds
to
main
November 27, 2024 04:41
Verify the register widths of the corresponding operands match the floating point format expected size.
arsenm
force-pushed
the
users/arsenm/gfx950/verify-f8f6f4-format-assembler
branch
from
November 27, 2024 04:41
1d53ad5
to
2e790ba
Compare
arsenm
deleted the
users/arsenm/gfx950/verify-f8f6f4-format-assembler
branch
November 27, 2024 04:45
This was referenced Dec 2, 2024
searlmc1
pushed a commit
to ROCm/llvm-project
that referenced
this pull request
Feb 3, 2025
Verify the register widths of the corresponding operands match the floating point format expected size.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Verify the register widths of the corresponding operands match
the floating point format expected size.