-
Notifications
You must be signed in to change notification settings - Fork 14.2k
[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782
Conversation
Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Jun Wang (jwanggit86) ChangesAllows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10. Full diff: https://github.com/llvm/llvm-project/pull/144782.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..c459c4df11e9e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
}
+defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@@ -2504,6 +2505,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
+ multiclass VOP2_Real_dpp_gfx9<bits<6> op> {
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>;
+ }
+
} // End DecoderNamespace = "GFX9"
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;
+
+defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>;
} // End AssemblerPredicate = isGFX9Only
defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index 3dcf288bbbaa5..bbd36a9140b96 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13185,3 +13185,15 @@ v_pk_fmac_f16 v5, -4.0, v2
v_pk_fmac_f16 v5, v1, v255
// GFX10: encoding: [0x01,0xff,0x0b,0x78]
+
+v_pk_fmac_f16 v5, v1, v2
+// GFX10: encoding: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+
+v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX10: encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
new file mode 100644
index 0000000000000..f7dab2d0359dc
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx942 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+
+v_pk_fmac_f16 v5, v1, v2
+// CHECK-MI: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3]
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
index 1774efa4a65c7..4a7471e6c6f98 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
@@ -2476,3 +2476,10 @@
# W32: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
# W64: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
index 40b8f24e4d72f..233f93a5b8e7d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
@@ -222,3 +222,6 @@
# W32: v_cndmask_b32_dpp v0, v1, v2, vcc_lo dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
# W64: v_cndmask_b32_dpp v0, v1, v2, vcc dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
+0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
new file mode 100644
index 0000000000000..ac1ef4baa9aa4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s
+
+# CHECK-MI: v_pk_fmac_f16_e32 v5, v1, v2
+0x01,0x05,0x0a,0x78
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
|
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to define separate dpp real, VOP2_Real_gfx10 should do it for you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to define separate dpp real, VOP2_Real_gfx10 should do it for you.
This would create some additional instructions though, i.e., for GFX9, V_PK_FMAC_F16_e32_gfx9
, V_PK_FMAC_F16_e64_gfx9
, V_PK_FMAC_F16_sdwa_gfx9
, and for GFX10, V_PK_FMAC_F16_e64_gfx10
, V_PK_FMAC_F16_sdwa_gfx10
. Would this be a problem?
Also, here (Line 2173) VOP2_Real_e32_gfx10
is instantiated with a predicate on Line 2172, but VOP2_Real_gfx10
does not have this predicate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually gfx9 does support SDWA for this instruction. Add few tests as your change has enabled it?
For gfx10 I think you need to add a test to the gfx10_unsupported_sdwa.s, because it is unsupported there.
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s | |||
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; | |||
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; | |||
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; | |||
|
|||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here, VOP2_Real_e32e64_gfx9 shall define all forms.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code updated as suggested.
let IsSingle = 1 in { | ||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | ||
} | ||
defm V_PK_FMAC_F16 : VOP2_Real_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove the blank line above.
@@ -2560,6 +2558,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s | |||
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; | |||
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; | |||
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; | |||
|
|||
defm V_PK_FMAC_F16 : VOP2_Real_e32e64_gfx9<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove the blank line and align the column.
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually gfx9 does support SDWA for this instruction. Add few tests as your change has enabled it?
For gfx10 I think you need to add a test to the gfx10_unsupported_sdwa.s, because it is unsupported there.
Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.