Skip to content

Commit 07c5920

Browse files
committed
Reland "[AMDGPU] Wave32 CodeGen for amdgcn.ballot.i64"
This time without the extra `->dump()`. A recent addition to the device libs, `__ockl_dm_trim`, caused a series of failures at O0 due to an i64 ballot intrinsic being inlined into a wave32 function. The quick fix for this is to support codegen for this rare case. A proper long-term fix for this type of issue is still being discussed. Fixes SWDEV-408929, SWDEV-408957, SWDEV-409885, SWDEV-410193. Reviewed By: #amdgpu, arsenm. Differential Revision: https://reviews.llvm.org/D155050
1 parent 5fca4ce commit 07c5920

File tree

3 files changed

+160
-21
lines changed

3 files changed

+160
-21
lines changed

Diff for: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

+26-9
Original file line numberDiff line numberDiff line change
@@ -1326,27 +1326,44 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
13261326
Register DstReg = I.getOperand(0).getReg();
13271327
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
13281328
const bool Is64 = Size == 64;
1329+
const bool IsWave32 = (STI.getWavefrontSize() == 32);
13291330

1330-
if (Size != STI.getWavefrontSize())
1331+
// In the common case, the return type matches the wave size.
1332+
// However, we also support emitting i64 ballots in wave32 mode.
1333+
if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
13311334
return false;
13321335

13331336
std::optional<ValueAndVReg> Arg =
13341337
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
13351338

1339+
const auto BuildCopy = [&](Register SrcReg) {
1340+
if (Size == STI.getWavefrontSize()) {
1341+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1342+
.addReg(SrcReg);
1343+
return;
1344+
}
1345+
1346+
// If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
1347+
Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1348+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1349+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1350+
.addReg(SrcReg)
1351+
.addImm(AMDGPU::sub0)
1352+
.addReg(HiReg)
1353+
.addImm(AMDGPU::sub1);
1354+
};
1355+
13361356
if (Arg) {
13371357
const int64_t Value = Arg->Value.getSExtValue();
13381358
if (Value == 0) {
13391359
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
13401360
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1341-
} else if (Value == -1) { // all ones
1342-
Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1343-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1344-
} else
1361+
} else if (Value == -1) // all ones
1362+
BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1363+
else
13451364
return false;
1346-
} else {
1347-
Register SrcReg = I.getOperand(2).getReg();
1348-
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1349-
}
1365+
} else
1366+
BuildCopy(I.getOperand(2).getReg());
13501367

13511368
I.eraseFromParent();
13521369
return true;

Diff for: llvm/lib/Target/AMDGPU/VOPCInstructions.td

+28-12
Original file line numberDiff line numberDiff line change
@@ -992,11 +992,18 @@ multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
992992
(i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
993993
>;
994994

995-
let WaveSizePredicate = isWave32 in
996-
def : GCNPat <
997-
(i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
998-
(i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
999-
>;
995+
let WaveSizePredicate = isWave32 in {
996+
def : GCNPat <
997+
(i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
998+
(i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
999+
>;
1000+
1001+
// Support codegen of i64 setcc in wave32 mode.
1002+
def : GCNPat <
1003+
(i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
1004+
(i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1))
1005+
>;
1006+
}
10001007
}
10011008

10021009
defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
@@ -1056,13 +1063,22 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
10561063
DSTCLAMP.NONE), SReg_64))
10571064
>;
10581065

1059-
let WaveSizePredicate = isWave32 in
1060-
def : GCNPat <
1061-
(i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
1062-
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
1063-
(i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
1064-
DSTCLAMP.NONE), SReg_32))
1065-
>;
1066+
let WaveSizePredicate = isWave32 in {
1067+
def : GCNPat <
1068+
(i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
1069+
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
1070+
(i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
1071+
DSTCLAMP.NONE), SReg_32))
1072+
>;
1073+
1074+
def : GCNPat <
1075+
(i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
1076+
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
1077+
(i64 (REG_SEQUENCE SReg_64, (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
1078+
DSTCLAMP.NONE), sub0,
1079+
(S_MOV_B32 (i32 0)), sub1))
1080+
>;
1081+
}
10661082
}
10671083

10681084
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
+106
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
3+
; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
4+
; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
5+
; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
6+
7+
declare i64 @llvm.amdgcn.ballot.i64(i1)
8+
declare i64 @llvm.ctpop.i64(i64)
9+
10+
; Test ballot(0)
11+
12+
define amdgpu_cs i64 @constant_false() {
13+
; CHECK-LABEL: constant_false:
14+
; CHECK: ; %bb.0:
15+
; CHECK-NEXT: s_mov_b32 s0, 0
16+
; CHECK-NEXT: s_mov_b32 s1, 0
17+
; CHECK-NEXT: ; return to shader part epilog
18+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0)
19+
ret i64 %ballot
20+
}
21+
22+
; Test ballot(1)
23+
24+
define amdgpu_cs i64 @constant_true() {
25+
; DAGISEL-LABEL: constant_true:
26+
; DAGISEL: ; %bb.0:
27+
; DAGISEL-NEXT: s_mov_b32 s0, exec_lo
28+
; DAGISEL-NEXT: s_mov_b32 s1, exec_hi
29+
; DAGISEL-NEXT: ; return to shader part epilog
30+
;
31+
; GISEL-LABEL: constant_true:
32+
; GISEL: ; %bb.0:
33+
; GISEL-NEXT: s_mov_b32 s0, exec_lo
34+
; GISEL-NEXT: s_mov_b32 s1, 0
35+
; GISEL-NEXT: ; return to shader part epilog
36+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1)
37+
ret i64 %ballot
38+
}
39+
40+
; Test ballot of a non-comparison operation
41+
42+
define amdgpu_cs i64 @non_compare(i32 %x) {
43+
; CHECK-LABEL: non_compare:
44+
; CHECK: ; %bb.0:
45+
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
46+
; CHECK-NEXT: s_mov_b32 s1, 0
47+
; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0
48+
; CHECK-NEXT: ; return to shader part epilog
49+
%trunc = trunc i32 %x to i1
50+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
51+
ret i64 %ballot
52+
}
53+
54+
; Test ballot of comparisons
55+
56+
define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
57+
; CHECK-LABEL: compare_ints:
58+
; CHECK: ; %bb.0:
59+
; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1
60+
; CHECK-NEXT: s_mov_b32 s1, 0
61+
; CHECK-NEXT: ; return to shader part epilog
62+
%cmp = icmp eq i32 %x, %y
63+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
64+
ret i64 %ballot
65+
}
66+
67+
define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
68+
; DAGISEL-LABEL: compare_int_with_constant:
69+
; DAGISEL: ; %bb.0:
70+
; DAGISEL-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0
71+
; DAGISEL-NEXT: s_mov_b32 s1, 0
72+
; DAGISEL-NEXT: ; return to shader part epilog
73+
;
74+
; GISEL-LABEL: compare_int_with_constant:
75+
; GISEL: ; %bb.0:
76+
; GISEL-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0
77+
; GISEL-NEXT: s_mov_b32 s1, 0
78+
; GISEL-NEXT: ; return to shader part epilog
79+
%cmp = icmp sge i32 %x, 99
80+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
81+
ret i64 %ballot
82+
}
83+
84+
define amdgpu_cs i64 @compare_floats(float %x, float %y) {
85+
; CHECK-LABEL: compare_floats:
86+
; CHECK: ; %bb.0:
87+
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
88+
; CHECK-NEXT: s_mov_b32 s1, 0
89+
; CHECK-NEXT: ; return to shader part epilog
90+
%cmp = fcmp ogt float %x, %y
91+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
92+
ret i64 %ballot
93+
}
94+
95+
define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
96+
; CHECK-LABEL: ctpop_of_ballot:
97+
; CHECK: ; %bb.0:
98+
; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1
99+
; CHECK-NEXT: s_mov_b32 s1, 0
100+
; CHECK-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
101+
; CHECK-NEXT: ; return to shader part epilog
102+
%cmp = fcmp ogt float %x, %y
103+
%ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
104+
%bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
105+
ret i64 %bcnt
106+
}

0 commit comments

Comments
 (0)