Skip to content

Commit

Permalink
Initial partially working nvptx ballot_group algs.
Browse files Browse the repository at this point in the history
Signed-off-by: JackAKirk <jack.kirk@codeplay.com>

cluster/ballot/opportunistic_group cuda support.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
  • Loading branch information
JackAKirk committed Apr 6, 2023
1 parent 56e05ce commit c546762
Show file tree
Hide file tree
Showing 10 changed files with 73 additions and 8 deletions.
7 changes: 6 additions & 1 deletion clang/include/clang/Basic/BuiltinsNVPTX.def
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#pragma push_macro("PTX42")
#pragma push_macro("PTX60")
#pragma push_macro("PTX61")
#pragma push_macro("PTX62")
#pragma push_macro("PTX63")
#pragma push_macro("PTX64")
#pragma push_macro("PTX65")
Expand All @@ -66,7 +67,8 @@
#define PTX65 "ptx65|" PTX70
#define PTX64 "ptx64|" PTX65
#define PTX63 "ptx63|" PTX64
#define PTX61 "ptx61|" PTX63
#define PTX62 "ptx62|" PTX63
#define PTX61 "ptx61|" PTX62
#define PTX60 "ptx60|" PTX61
#define PTX42 "ptx42|" PTX60

Expand Down Expand Up @@ -594,6 +596,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60)
TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60)
TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60)

// Activemask
TARGET_BUILTIN(__nvvm_activemask, "Ui", "", PTX62)

// Match
TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60))
TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60))
Expand Down
2 changes: 1 addition & 1 deletion libclc/ptx-nvidiacl/libspirv/SOURCES
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ images/image_helpers.ll
images/image.cl
group/collectives_helpers.ll
group/collectives.cl
group/group_ballot.cl
group/group_non_uniform.cl
atomic/atomic_add.cl
atomic/atomic_and.cl
atomic/atomic_cmpxchg.cl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include "membermask.h"
#include <integer/popcount.h>

#include <spirv/spirv.h>
#include <spirv/spirv_types.h>
Expand All @@ -30,7 +31,12 @@ _Z29__spirv_GroupNonUniformBallotjb(unsigned flag, bool predicate) {
unsigned threads = __clc__membermask();

// run the ballot operation
res[0] = __nvvm_vote_ballot_sync(threads, predicate);
res[0] = __nvvm_vote_ballot_sync(threads, predicate); // couldnt call this within intel impl because undefined behaviour if not all reach it?

return res;
}

_CLC_DEF _CLC_CONVERGENT uint _Z37__spirv_GroupNonUniformBallotBitCountN5__spv5Scope4FlagEiDv4_j(uint scope, uint flag, __clc_vec4_uint32_t mask) {

return __clc_native_popcount(__nvvm_read_ptx_sreg_lanemask_lt() & mask[0]);
}
5 changes: 5 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -4628,6 +4628,11 @@ def int_nvvm_match_all_sync_i64p :
Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;

// activemask.b32 d;
def int_nvvm_activemask_ui : ClangBuiltin<"__nvvm_activemask">,
Intrinsic<[llvm_i32_ty], [],
[IntrConvergent, IntrInaccessibleMemOnly]>;

//
// REDUX.SYNC
//
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,13 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC<Int32Regs, "b32", int_nvvm_match_all_s
defm MATCH_ALLP_SYNC_64 : MATCH_ALLP_SYNC<Int64Regs, "b64", int_nvvm_match_all_sync_i64p,
i64imm>;

// reqs ptx62 sm_30;
// activemask.b32 d;
def INT_ACTIVEMASK :
NVPTXInst<(outs Int32Regs:$dest), (ins),
"activemask.b32 \t$dest;",
[(set Int32Regs:$dest, (int_nvvm_activemask_ui))]>;

multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
"redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
Expand Down
18 changes: 17 additions & 1 deletion sycl/include/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ template <typename Group> bool GroupAll(Group g, bool pred) {
template <typename ParentGroup>
bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
bool pred) {
#if defined (__SPIR__)
// ballot_group partitions its parent into two groups (0 and 1)
// We have to force each group down different control flow
// Work-items in the "false" group (0) may still be active
Expand All @@ -117,6 +118,10 @@ bool GroupAll(ext::oneapi::experimental::ballot_group<ParentGroup> g,
} else {
return __spirv_GroupNonUniformAll(group_scope<ParentGroup>::value, pred);
}
#elif defined (__NVPTX__)
sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
return __nvvm_vote_all_sync(MemberMask[0], pred);
#endif
}

template <typename Group> bool GroupAny(Group g, bool pred) {
Expand All @@ -125,6 +130,7 @@ template <typename Group> bool GroupAny(Group g, bool pred) {
template <typename ParentGroup>
bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
bool pred) {
#if defined (__SPIR__)
// ballot_group partitions its parent into two groups (0 and 1)
// We have to force each group down different control flow
// Work-items in the "false" group (0) may still be active
Expand All @@ -133,6 +139,10 @@ bool GroupAny(ext::oneapi::experimental::ballot_group<ParentGroup> g,
} else {
return __spirv_GroupNonUniformAny(group_scope<ParentGroup>::value, pred);
}
#elif defined (__NVPTX__)
sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
return __nvvm_vote_any_sync(MemberMask[0], pred);
#endif
}

// Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
Expand Down Expand Up @@ -219,13 +229,18 @@ GroupBroadcast(sycl::ext::oneapi::experimental::ballot_group<ParentGroup> g,
// ballot_group partitions its parent into two groups (0 and 1)
// We have to force each group down different control flow
// Work-items in the "false" group (0) may still be active
#if defined(__SPIR__)
if (g.get_group_id() == 1) {
return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
OCLX, OCLId);
} else {
return __spirv_GroupNonUniformBroadcast(group_scope<ParentGroup>::value,
OCLX, OCLId);
}
#elif defined(__NVPTX__)
sycl::vec<unsigned, 4> MemberMask = detail::ExtractMask(detail::GetMask(g));
return __nvvm_shfl_sync_idx_i32(MemberMask[0], x, LocalId, 31); //31 not 32 as docs suggest.
#endif
}

template <typename Group, typename T, typename IdT>
Expand Down Expand Up @@ -886,7 +901,7 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
template <typename Group>
typename std::enable_if_t<
ext::oneapi::experimental::is_user_constructed_group_v<Group>>
ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) {
#if defined(__SPIR__)
// SPIR-V does not define an instruction to synchronize partial groups.
// However, most (possibly all?) of the current SPIR-V targets execute
Expand All @@ -899,6 +914,7 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) {
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
#elif defined(__NVPTX__)
// TODO: Call syncwarp with appropriate mask extracted from the group
__nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]);
#endif
}

Expand Down
4 changes: 2 additions & 2 deletions sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ template <typename ParentGroup> class ballot_group {
friend ballot_group<ParentGroup>
get_ballot_group<ParentGroup>(ParentGroup g, bool predicate);

friend uint32_t sycl::detail::IdToMaskPosition<ballot_group<ParentGroup>>(
ballot_group<ParentGroup> Group, uint32_t Id);
friend sub_group_mask sycl::detail::GetMask<ballot_group<ParentGroup>>(ballot_group<ParentGroup> Group);

};

template <typename Group>
Expand Down
19 changes: 19 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/cluster_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include <sycl/ext/oneapi/experimental/non_uniform_groups.hpp>
#include <sycl/ext/oneapi/sub_group_mask.hpp>

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
Expand Down Expand Up @@ -111,8 +112,17 @@ template <size_t ClusterSize, typename ParentGroup> class cluster_group {
#endif
}

#if defined (__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
private:
sub_group_mask Mask;
#endif

protected:
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
cluster_group(ext::oneapi::sub_group_mask mask):Mask(mask) {}
#else
cluster_group() {}
#endif

friend cluster_group<ClusterSize, ParentGroup>
get_cluster_group<ClusterSize, ParentGroup>(ParentGroup g);
Expand All @@ -125,7 +135,16 @@ inline std::enable_if_t<sycl::is_group_v<std::decay_t<Group>> &&
get_cluster_group(Group group) {
(void)group;
#ifdef __SYCL_DEVICE_ONLY__
#if defined(__NVPTX__)
uint32_t loc_id = group.get_local_linear_id();
uint32_t loc_size = group.get_local_linear_range();
uint32_t bits = (1 << ClusterSize) - 1;

return cluster_group<ClusterSize, sycl::sub_group>(sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(
bits << ((loc_id / ClusterSize) * ClusterSize), loc_size));
#else
return cluster_group<ClusterSize, sycl::sub_group>();
#endif
#else
throw runtime_error("Non-uniform groups are not supported on host device.",
PI_ERROR_INVALID_DEVICE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,16 @@ inline uint32_t CallerPositionInMask(ext::oneapi::sub_group_mask Mask) {
}
#endif

//todo inline works?
template <typename NonUniformGroup>
inline ext::oneapi::sub_group_mask GetMask(NonUniformGroup Group) {
return Group.Mask;
}

template <typename NonUniformGroup>
inline uint32_t IdToMaskPosition(NonUniformGroup Group, uint32_t Id) {
// TODO: This will need to be optimized
sycl::vec<unsigned, 4> MemberMask = ExtractMask(Group.Mask);
sycl::vec<unsigned, 4> MemberMask = ExtractMask(GetMask(Group));
uint32_t Count = 0;
for (int i = 0; i < 4; ++i) {
for (int b = 0; b < 32; ++b) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ inline opportunistic_group get_opportunistic_group() {
sub_group_mask mask = sycl::ext::oneapi::group_ballot(sg, true);
return opportunistic_group(mask);
#elif defined(__NVPTX__)
// TODO: Construct from __activemask
sub_group_mask mask = sycl::detail::Builder::createSubGroupMask<ext::oneapi::sub_group_mask>(__nvvm_activemask(), 32);
return opportunistic_group(mask);
#endif
#else
throw runtime_error("Non-uniform groups are not supported on host device.",
Expand Down

0 comments on commit c546762

Please sign in to comment.