Skip to content

Commit 96b2ba3

Browse files
arsenm authored and searlmc1 committed
ockl: Don't use wave32 ballot builtin
Wave32 and wave64 paths cannot really co-exist in the same function or callgraph. They need to be treated as a hard ABI incompatibility. We cannot handle the wave32 operation on wave64, but we can and do handle the wave64 operation on wave32.

Given the current linking scheme, the most expedient fix for this breakage is to pretend wave32 does not exist and just use the wave64 ballot. The optimizer will fold the 64-bit ballot intrinsic to the 32-bit one when it sees a 32-bit target.

This was reported broken in llvm#89332.

Change-Id: If897f631066e257e18d8adf574cd17d3f9278ca7
1 parent c98bed9 commit 96b2ba3

File tree

2 files changed

+13
-80
lines changed

2 files changed

+13
-80
lines changed

amd/device-libs/ockl/src/dm.cl

+9 -41
Original file line number | Diff line number | Diff line change
@@ -294,9 +294,14 @@ first(__global void * v)
294294
return __builtin_astype(w2, __global void *);
295295
}
296296

297-
REQUIRES_WAVE64
297+
// Read val from one active lane whose predicate is one.
298+
// If no lanes have the predicate set, return none
299+
// This is like first, except that first may not have its predicate set
298300
static uint
299-
elect_uint_wave64(int pred, uint val, uint none) {
301+
elect_uint(int pred, uint val, uint none)
302+
{
303+
// Pretend wave32 doesn't exist. The wave64 ballot works, and the high half
304+
// will fold out as 0.
300305
uint ret = none;
301306

302307
ulong mask = __builtin_amdgcn_ballot_w64(pred != 0);
@@ -308,49 +313,12 @@ elect_uint_wave64(int pred, uint val, uint none) {
308313
return ret;
309314
}
310315

311-
REQUIRES_WAVE32
312-
static uint
313-
elect_uint_wave32(int pred, uint val, uint none) {
314-
uint ret = none;
315-
uint mask = __builtin_amdgcn_ballot_w32(pred != 0);
316-
if (mask != 0U) {
317-
uint l = __ockl_ctz_u32(mask);
318-
ret = __builtin_amdgcn_ds_bpermute(l << 2, val);
319-
}
320-
321-
return ret;
322-
}
323-
324-
// Read val from one active lane whose predicate is one.
325-
// If no lanes have the predicate set, return none
326-
// This is like first, except that first may not have its predicate set
327-
static uint
328-
elect_uint(int pred, uint val, uint none)
329-
{
330-
return __oclc_wavefrontsize64 ? elect_uint_wave64(pred, val, none) : elect_uint_wave32(pred, val, none);
331-
}
332-
333-
REQUIRES_WAVE64
334-
static uint
335-
votes_wave64(bool b)
336-
{
337-
ulong mask = __builtin_amdgcn_ballot_w64(b);
338-
return __builtin_popcountl(mask);
339-
}
340-
341-
REQUIRES_WAVE32
342-
static uint
343-
votes_wave32(bool b)
344-
{
345-
uint mask = __builtin_amdgcn_ballot_w32(b);
346-
return __builtin_popcount(mask);
347-
}
348-
349316
// Count the number of nonzero arguments across the wave
350317
static uint
351318
votes(bool b)
352319
{
353-
return __oclc_wavefrontsize64 ? votes_wave64(b) : votes_wave32(b);
320+
ulong mask = __builtin_amdgcn_ballot_w64(b);
321+
return __builtin_popcountl(mask);
354322
}
355323

356324
// The kind of the smallest block that can hold sz bytes

amd/device-libs/ockl/src/wfaas.cl

+4 -39
Original file line number | Diff line number | Diff line change
@@ -22,60 +22,25 @@ static int optimizationBarrierHack(int in_val)
2222
return out_val;
2323
}
2424

25-
REQUIRES_WAVE32
26-
static bool wfany_impl_w32(int e) {
27-
return __builtin_amdgcn_ballot_w32(e) != 0;
28-
}
29-
30-
REQUIRES_WAVE64
31-
static bool wfany_impl_w64(int e) {
32-
return __builtin_amdgcn_ballot_w64(e) != 0;
33-
}
34-
3525
ATTR bool
3626
OCKL_MANGLE_I32(wfany)(int e)
3727
{
3828
e = optimizationBarrierHack(e);
39-
return __oclc_wavefrontsize64 ?
40-
wfany_impl_w64(e) : wfany_impl_w32(e);
41-
}
42-
43-
REQUIRES_WAVE32
44-
static bool wfall_impl_w32(int e) {
45-
return __builtin_amdgcn_ballot_w32(e) == __builtin_amdgcn_read_exec_lo();
46-
}
47-
48-
REQUIRES_WAVE64
49-
static bool wfall_impl_w64(int e) {
50-
return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec();
29+
return __builtin_amdgcn_ballot_w64(e) != 0;
5130
}
5231

5332
ATTR bool
5433
OCKL_MANGLE_I32(wfall)(int e)
5534
{
5635
e = optimizationBarrierHack(e);
57-
return __oclc_wavefrontsize64 ?
58-
wfall_impl_w64(e) : wfall_impl_w32(e);
59-
}
60-
61-
62-
REQUIRES_WAVE32
63-
static bool wfsame_impl_w32(int e) {
64-
uint u = __builtin_amdgcn_ballot_w32(e);
65-
return (u == 0) | (u == __builtin_amdgcn_read_exec_lo());
66-
}
67-
68-
REQUIRES_WAVE64
69-
static bool wfsame_impl_w64(int e) {
70-
ulong u = __builtin_amdgcn_ballot_w64(e);
71-
return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
36+
return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec();
7237
}
7338

7439
ATTR bool
7540
OCKL_MANGLE_I32(wfsame)(int e)
7641
{
7742
e = optimizationBarrierHack(e);
78-
return __oclc_wavefrontsize64 ?
79-
wfsame_impl_w64(e) : wfsame_impl_w32(e);
43+
ulong u = __builtin_amdgcn_ballot_w64(e);
44+
return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
8045
}
8146

0 commit comments

Comments
 (0)