Skip to content

Commit 48f1915

Browse files
author
Abhinav Garg
committed
Add register bank legalization for G_FADD
1 parent d09b505 commit 48f1915

File tree

4 files changed: 263 additions and 1 deletion.

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -837,6 +837,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
837837
return LLT::scalar(32);
838838
case Sgpr64:
839839
case Vgpr64:
840+
case UniInVgprS64:
840841
return LLT::scalar(64);
841842
case Sgpr128:
842843
case Vgpr128:
@@ -960,6 +961,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
960961
case UniInVcc:
961962
case UniInVgprS16:
962963
case UniInVgprS32:
964+
case UniInVgprS64:
963965
case UniInVgprV2S16:
964966
case UniInVgprV4S32:
965967
case UniInVgprB32:
@@ -1092,6 +1094,7 @@ void RegBankLegalizeHelper::applyMappingDst(
10921094
break;
10931095
}
10941096
case UniInVgprS32:
1097+
case UniInVgprS64:
10951098
case UniInVgprV2S16:
10961099
case UniInVgprV4S32: {
10971100
assert(Ty == getTyFromID(MethodIDs[OpIdx]));

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -906,9 +906,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
906906
bool hasSALUFloat = ST->hasSALUFloatInsts();
907907

908908
addRulesForGOpcs({G_FADD}, Standard)
909+
.Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
910+
.Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
911+
.Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
909912
.Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
910913
.Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
911-
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
914+
.Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
915+
.Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
916+
.Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
917+
.Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}})
918+
.Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
919+
.Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}})
920+
.Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}});
912921

913922
addRulesForGOpcs({G_FPTOUI})
914923
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID {
9292
V4S32,
9393

9494
UniV2S16,
95+
UniV2S32,
9596

9697
DivV2S16,
98+
DivV2S32,
9799

98100
// B types
99101
B32,
@@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID {
178180
UniInVcc,
179181
UniInVgprS16,
180182
UniInVgprS32,
183+
UniInVgprS64,
181184
UniInVgprV2S16,
185+
UniInVgprV2S32,
182186
UniInVgprV4S32,
183187
UniInVgprB32,
184188
UniInVgprB64,
Lines changed: 246 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,246 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
4+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
5+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
8+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
9+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
10+
11+
define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) {
12+
; GFX11-SDAG-FAKE16-LABEL: fadd_s16_uniform:
13+
; GFX11-SDAG-FAKE16: ; %bb.0:
14+
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
15+
; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
16+
;
17+
; GFX11-SDAG-TRUE16-LABEL: fadd_s16_uniform:
18+
; GFX11-SDAG-TRUE16: ; %bb.0:
19+
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
20+
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
21+
;
22+
; GFX11-GISEL-FAKE16-LABEL: fadd_s16_uniform:
23+
; GFX11-GISEL-FAKE16: ; %bb.0:
24+
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1
25+
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
26+
; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
27+
; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
28+
; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
29+
;
30+
; GFX11-GISEL-TRUE16-LABEL: fadd_s16_uniform:
31+
; GFX11-GISEL-TRUE16: ; %bb.0:
32+
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1
33+
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
34+
; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
35+
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0
36+
; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
37+
;
38+
; GFX12-LABEL: fadd_s16_uniform:
39+
; GFX12: ; %bb.0:
40+
; GFX12-NEXT: s_add_f16 s0, s0, s1
41+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
42+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
43+
; GFX12-NEXT: ; return to shader part epilog
44+
%fadd = fadd half %a, %b
45+
ret half %fadd
46+
}
47+
48+
define amdgpu_ps half @fadd_s16_div(half %a, half %b) {
49+
; GFX11-SDAG-FAKE16-LABEL: fadd_s16_div:
50+
; GFX11-SDAG-FAKE16: ; %bb.0:
51+
; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
52+
; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
53+
;
54+
; GFX11-SDAG-TRUE16-LABEL: fadd_s16_div:
55+
; GFX11-SDAG-TRUE16: ; %bb.0:
56+
; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
57+
; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog
58+
;
59+
; GFX11-GISEL-FAKE16-LABEL: fadd_s16_div:
60+
; GFX11-GISEL-FAKE16: ; %bb.0:
61+
; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
62+
; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog
63+
;
64+
; GFX11-GISEL-TRUE16-LABEL: fadd_s16_div:
65+
; GFX11-GISEL-TRUE16: ; %bb.0:
66+
; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
67+
; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog
68+
;
69+
; GFX12-SDAG-FAKE16-LABEL: fadd_s16_div:
70+
; GFX12-SDAG-FAKE16: ; %bb.0:
71+
; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
72+
; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog
73+
;
74+
; GFX12-SDAG-TRUE16-LABEL: fadd_s16_div:
75+
; GFX12-SDAG-TRUE16: ; %bb.0:
76+
; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
77+
; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog
78+
;
79+
; GFX12-GISEL-FAKE16-LABEL: fadd_s16_div:
80+
; GFX12-GISEL-FAKE16: ; %bb.0:
81+
; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
82+
; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog
83+
;
84+
; GFX12-GISEL-TRUE16-LABEL: fadd_s16_div:
85+
; GFX12-GISEL-TRUE16: ; %bb.0:
86+
; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
87+
; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog
88+
%fadd = fadd half %a, %b
89+
ret half %fadd
90+
}
91+
92+
define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) {
93+
; GFX11-LABEL: fadd_s32_uniform:
94+
; GFX11: ; %bb.0:
95+
; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
96+
; GFX11-NEXT: ; return to shader part epilog
97+
;
98+
; GFX12-LABEL: fadd_s32_uniform:
99+
; GFX12: ; %bb.0:
100+
; GFX12-NEXT: s_add_f32 s0, s0, s1
101+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
102+
; GFX12-NEXT: v_mov_b32_e32 v0, s0
103+
; GFX12-NEXT: ; return to shader part epilog
104+
%fadd = fadd float %a, %b
105+
ret float %fadd
106+
}
107+
108+
define amdgpu_ps float @fadd_s32_div(float %a, float %b) {
109+
; GCN-LABEL: fadd_s32_div:
110+
; GCN: ; %bb.0:
111+
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
112+
; GCN-NEXT: ; return to shader part epilog
113+
%fadd = fadd float %a, %b
114+
ret float %fadd
115+
}
116+
117+
define amdgpu_ps double @fadd_s64_uniform(double inreg %a, double inreg %b) {
118+
; GFX11-LABEL: fadd_s64_uniform:
119+
; GFX11: ; %bb.0:
120+
; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], s[2:3]
121+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
122+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
123+
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
124+
; GFX11-NEXT: ; return to shader part epilog
125+
;
126+
; GFX12-LABEL: fadd_s64_uniform:
127+
; GFX12: ; %bb.0:
128+
; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[2:3]
129+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
130+
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
131+
; GFX12-NEXT: v_readfirstlane_b32 s1, v1
132+
; GFX12-NEXT: s_wait_alu 0xf1ff
133+
; GFX12-NEXT: ; return to shader part epilog
134+
%fadd = fadd double %a, %b
135+
ret double %fadd
136+
}
137+
138+
define amdgpu_ps double @fadd_s64_div(double %a, double %b) {
139+
; GFX11-LABEL: fadd_s64_div:
140+
; GFX11: ; %bb.0:
141+
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
142+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
143+
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
144+
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
145+
; GFX11-NEXT: ; return to shader part epilog
146+
;
147+
; GFX12-LABEL: fadd_s64_div:
148+
; GFX12: ; %bb.0:
149+
; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3]
150+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
151+
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
152+
; GFX12-NEXT: v_readfirstlane_b32 s1, v1
153+
; GFX12-NEXT: ; return to shader part epilog
154+
%fadd = fadd double %a, %b
155+
ret double %fadd
156+
}
157+
158+
define <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {
159+
; GFX11-LABEL: fadd_v2s16_uniform:
160+
; GFX11: ; %bb.0:
161+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
162+
; GFX11-NEXT: v_pk_add_f16 v0, s0, s1
163+
; GFX11-NEXT: s_setpc_b64 s[30:31]
164+
;
165+
; GFX12-LABEL: fadd_v2s16_uniform:
166+
; GFX12: ; %bb.0:
167+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
168+
; GFX12-NEXT: s_wait_expcnt 0x0
169+
; GFX12-NEXT: s_wait_samplecnt 0x0
170+
; GFX12-NEXT: s_wait_bvhcnt 0x0
171+
; GFX12-NEXT: s_wait_kmcnt 0x0
172+
; GFX12-NEXT: v_pk_add_f16 v0, s0, s1
173+
; GFX12-NEXT: s_setpc_b64 s[30:31]
174+
%fadd = fadd <2 x half> %a, %b
175+
ret <2 x half> %fadd
176+
}
177+
178+
define <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) {
179+
; GFX11-LABEL: fadd_v2s16_div:
180+
; GFX11: ; %bb.0:
181+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
182+
; GFX11-NEXT: v_pk_add_f16 v0, v0, v1
183+
; GFX11-NEXT: s_setpc_b64 s[30:31]
184+
;
185+
; GFX12-LABEL: fadd_v2s16_div:
186+
; GFX12: ; %bb.0:
187+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
188+
; GFX12-NEXT: s_wait_expcnt 0x0
189+
; GFX12-NEXT: s_wait_samplecnt 0x0
190+
; GFX12-NEXT: s_wait_bvhcnt 0x0
191+
; GFX12-NEXT: s_wait_kmcnt 0x0
192+
; GFX12-NEXT: v_pk_add_f16 v0, v0, v1
193+
; GFX12-NEXT: s_setpc_b64 s[30:31]
194+
%fadd = fadd <2 x half> %a, %b
195+
ret <2 x half> %fadd
196+
}
197+
198+
define <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) {
199+
; GFX11-LABEL: fadd_v2s32_uniform:
200+
; GFX11: ; %bb.0:
201+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
202+
; GFX11-NEXT: v_add_f32_e64 v0, s0, s2
203+
; GFX11-NEXT: v_add_f32_e64 v1, s1, s3
204+
; GFX11-NEXT: s_setpc_b64 s[30:31]
205+
;
206+
; GFX12-LABEL: fadd_v2s32_uniform:
207+
; GFX12: ; %bb.0:
208+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
209+
; GFX12-NEXT: s_wait_expcnt 0x0
210+
; GFX12-NEXT: s_wait_samplecnt 0x0
211+
; GFX12-NEXT: s_wait_bvhcnt 0x0
212+
; GFX12-NEXT: s_wait_kmcnt 0x0
213+
; GFX12-NEXT: s_add_f32 s0, s0, s2
214+
; GFX12-NEXT: s_add_f32 s1, s1, s3
215+
; GFX12-NEXT: s_wait_alu 0xfffe
216+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
217+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
218+
; GFX12-NEXT: s_setpc_b64 s[30:31]
219+
%fadd = fadd <2 x float> %a, %b
220+
ret <2 x float> %fadd
221+
}
222+
223+
define <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) {
224+
; GFX11-LABEL: fadd_v2s32_div:
225+
; GFX11: ; %bb.0:
226+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227+
; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
228+
; GFX11-NEXT: s_setpc_b64 s[30:31]
229+
;
230+
; GFX12-LABEL: fadd_v2s32_div:
231+
; GFX12: ; %bb.0:
232+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
233+
; GFX12-NEXT: s_wait_expcnt 0x0
234+
; GFX12-NEXT: s_wait_samplecnt 0x0
235+
; GFX12-NEXT: s_wait_bvhcnt 0x0
236+
; GFX12-NEXT: s_wait_kmcnt 0x0
237+
; GFX12-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
238+
; GFX12-NEXT: s_setpc_b64 s[30:31]
239+
%fadd = fadd <2 x float> %a, %b
240+
ret <2 x float> %fadd
241+
}
242+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
243+
; GFX11-GISEL: {{.*}}
244+
; GFX11-SDAG: {{.*}}
245+
; GFX12-GISEL: {{.*}}
246+
; GFX12-SDAG: {{.*}}

0 commit comments

Comments
 (0)