From 62e436d89ddb393882688aee0b35aeda0f5f287d Mon Sep 17 00:00:00 2001 From: Konstantin S Bobrovsky Date: Fri, 31 May 2019 19:34:10 -0700 Subject: [PATCH] [SYCL] Implement hierarchical parallelism API. This is the first part of SYCL hierarchical parallelism implementation. It implements main related APIs: - h_item class - group::parallel_for_work_item functions - handler::parallel_for_work_group functions It is able to run workloads which use these APIs but do not contain data or code with group-visible side effects between the work group and work item scopes. Signed-off-by: Konstantin S Bobrovsky --- sycl/include/CL/sycl/accessor.hpp | 97 +++++------ sycl/include/CL/sycl/detail/cg.hpp | 97 ++++++++++- sycl/include/CL/sycl/detail/common.hpp | 98 ++++++++++++ sycl/include/CL/sycl/detail/helpers.hpp | 18 ++- sycl/include/CL/sycl/group.hpp | 128 ++++++++++++++- sycl/include/CL/sycl/h_item.hpp | 131 +++++++++++++++ sycl/include/CL/sycl/handler.hpp | 97 ++++++++--- sycl/include/CL/sycl/id.hpp | 36 ++--- sycl/include/CL/sycl/item.hpp | 5 + sycl/source/detail/scheduler/commands.cpp | 41 ++++- sycl/test/hier_par/hier_par_basic.cpp | 187 ++++++++++++++++++++++ 11 files changed, 834 insertions(+), 101 deletions(-) create mode 100644 sycl/include/CL/sycl/h_item.hpp create mode 100644 sycl/test/hier_par/hier_par_basic.cpp diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index e7d3b9e16613..48936bfe84c1 100644 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -198,9 +198,6 @@ class accessor_common { template class AccessorSubscript { static constexpr int Dims = Dimensions; - template - using enable_if_t = typename std::enable_if::type; - mutable id MIDs; AccType MAccessor; @@ -215,21 +212,21 @@ class accessor_common { } template - typename std::enable_if<(CurDims > 1), AccessorSubscript>::type + typename detail::enable_if_t<(CurDims > 1), AccessorSubscript> operator[](size_t Index) { MIDs[Dims - CurDims] = Index; return AccessorSubscript(MAccessor, MIDs); } template > + typename = detail::enable_if_t> RefType operator[](size_t Index) const { MIDs[Dims - CurDims] = Index; return MAccessor[MIDs]; } template > + typename = detail::enable_if_t> DataT operator[](size_t Index) const { MIDs[Dims - SubDims] = Index; return MAccessor[MIDs]; @@ -253,9 +250,6 @@ class accessor : AccessTarget == access::target::host_buffer), "Expected buffer type"); - template - using enable_if_t = typename std::enable_if::type; - using AccessorCommonT = detail::accessor_common; @@ -340,7 +334,7 @@ class accessor : template accessor( - enable_if_t> &BufferRef) #ifdef __SYCL_DEVICE_ONLY__ @@ -362,7 +356,7 @@ class accessor : template accessor( buffer &BufferRef, - enable_if_t &CommandGroupHandler) #ifdef __SYCL_DEVICE_ONLY__ : impl(id(), BufferRef.get_range(), BufferRef.MemRange) { @@ -379,7 +373,7 @@ class accessor : #endif template 0) && ((!IsPlaceH && IsHostBuf) || (IsPlaceH && (IsGlobalBuf || IsConstantBuf)))>> accessor(buffer &BufferRef) @@ -401,7 +395,7 @@ class accessor : #endif template 0) && (!IsPlaceH && (IsGlobalBuf || IsConstantBuf))>> accessor(buffer &BufferRef, handler &CommandGroupHandler) #ifdef __SYCL_DEVICE_ONLY__ @@ -419,7 +413,7 @@ class accessor : #endif template 0) && ((!IsPlaceH && IsHostBuf) || (IsPlaceH && (IsGlobalBuf || IsConstantBuf)))>> accessor(buffer &BufferRef, range AccessRange, @@ -441,7 +435,7 @@ class accessor : #endif template 0) && (!IsPlaceH && (IsGlobalBuf || IsConstantBuf))>> accessor(buffer &BufferRef, handler &CommandGroupHandler, range AccessRange, id AccessOffset = {}) @@ -464,53 +458,53 @@ class accessor : size_t get_count() const { return getMemoryRange().size(); } - template 0)>> + template 0)>> range get_range() const { return detail::convertToArrayOfN(getAccessRange()); } - template 0)>> + template 0)>> id get_offset() const { return detail::convertToArrayOfN(getOffset()); } template > + typename = detail::enable_if_t> operator RefType() const { const size_t LinearIndex = getLinearIndex(id()); return *(getQualifiedPtr() + LinearIndex); } template 0) && IsAccessAnyWrite>> + typename = detail::enable_if_t<(Dims > 0) && IsAccessAnyWrite>> RefType operator[](id Index) const { const size_t LinearIndex = getLinearIndex(Index); return getQualifiedPtr()[LinearIndex]; } template > + typename = detail::enable_if_t> RefType operator[](size_t Index) const { const size_t LinearIndex = getLinearIndex(id(Index)); return getQualifiedPtr()[LinearIndex]; } template > + typename = detail::enable_if_t> operator DataT() const { const size_t LinearIndex = getLinearIndex(id()); return *(getQualifiedPtr() + LinearIndex); } template 0) && IsAccessReadOnly>> + typename = detail::enable_if_t<(Dims > 0) && IsAccessReadOnly>> DataT operator[](id Index) const { const size_t LinearIndex = getLinearIndex(Index); return getQualifiedPtr()[LinearIndex]; } template > + typename = detail::enable_if_t> DataT operator[](size_t Index) const { const size_t LinearIndex = getLinearIndex(id(Index)); return getQualifiedPtr()[LinearIndex]; @@ -535,39 +529,39 @@ class accessor : } template - typename enable_if_t>::type + typename detail::enable_if_t>::type operator[](size_t Index) const { const size_t LinearIndex = getLinearIndex(id(Index)); return atomic( multi_ptr(getQualifiedPtr() + LinearIndex)); } - template 1)>> + template 1)>> typename AccessorCommonT::template AccessorSubscript operator[](size_t Index) const { return AccessorSubscript(*this, Index); } - template < - access::target AccessTarget_ = AccessTarget, - typename = enable_if_t> + template > DataT *get_pointer() const { const size_t LinearIndex = getLinearIndex(id()); return getQualifiedPtr() + LinearIndex; } - template < - access::target AccessTarget_ = AccessTarget, - typename = enable_if_t> + template > global_ptr get_pointer() const { const size_t LinearIndex = getLinearIndex(id()); return global_ptr(getQualifiedPtr() + LinearIndex); } - template < - access::target AccessTarget_ = AccessTarget, - typename = enable_if_t> + template > constant_ptr get_pointer() const { const size_t LinearIndex = getLinearIndex(id()); return constant_ptr(getQualifiedPtr() + LinearIndex); @@ -603,9 +597,6 @@ class accessor::type &; using PtrType = typename detail::PtrValueType::type *; - template - using enable_if_t = typename std::enable_if::type; - #ifdef __SYCL_DEVICE_ONLY__ detail::LocalAccessorBaseDevice impl; @@ -648,7 +639,7 @@ class accessor> + template > accessor(handler &CommandGroupHandler) #ifdef __SYCL_DEVICE_ONLY__ : impl(range{1}) { @@ -658,7 +649,7 @@ class accessor 0)>> + template 0)>> accessor(range AllocationSize, handler &CommandGroupHandler) #ifdef __SYCL_DEVICE_ONLY__ : impl(AllocationSize) { @@ -674,48 +665,48 @@ class accessor> + typename = detail::enable_if_t> operator RefType() const { return *getQualifiedPtr(); } template 0) && IsAccessAnyWrite>> + typename = detail::enable_if_t<(Dims > 0) && IsAccessAnyWrite>> RefType operator[](id Index) const { const size_t LinearIndex = getLinearIndex(Index); return getQualifiedPtr()[LinearIndex]; } template > + typename = detail::enable_if_t> RefType operator[](size_t Index) const { return getQualifiedPtr()[Index]; } - template < - int Dims = Dimensions, - typename = enable_if_t> + template > operator atomic() const { return atomic(multi_ptr(getQualifiedPtr())); } - template < - int Dims = Dimensions, - typename = enable_if_t<(Dims > 0) && AccessMode == access::mode::atomic>> + template 0) && + AccessMode == access::mode::atomic>> atomic operator[](id Index) const { const size_t LinearIndex = getLinearIndex(Index); return atomic( multi_ptr(getQualifiedPtr() + LinearIndex)); } - template < - int Dims = Dimensions, - typename = enable_if_t> + template > atomic operator[](size_t Index) const { return atomic(multi_ptr(getQualifiedPtr() + Index)); } - template 1)>> + template 1)>> typename AccessorCommonT::template AccessorSubscript operator[](size_t Index) const { return AccessorSubscript(*this, Index); diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 1a8eb3b98e33..1bb1351c14b0 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -9,8 +9,10 @@ #pragma once #include +#include #include #include +#include #include #include #include @@ -45,11 +47,12 @@ class ArgDesc { class NDRDescT { // The method initializes all sizes for dimensions greater than the passed one // to the default values, so they will not affect execution. - template void setNDRangeLeftover() { + void setNDRangeLeftover(int Dims_) { for (int I = Dims_; I < 3; ++I) { GlobalSize[I] = 1; LocalSize[I] = LocalSize[0] ? 1 : 0; GlobalOffset[I] = 0; + NumWorkGroups[I] = 0; } } @@ -61,9 +64,23 @@ class NDRDescT { GlobalSize[I] = NumWorkItems[I]; LocalSize[I] = 0; GlobalOffset[I] = 0; + NumWorkGroups[I] = 0; } + setNDRangeLeftover(Dims_); + Dims = Dims_; + } - setNDRangeLeftover(); + // Initializes this ND range descriptor with given range of work items and + // offset. + template + void set(sycl::range NumWorkItems, sycl::id Offset) { + for (int I = 0; I < Dims_; ++I) { + GlobalSize[I] = NumWorkItems[I]; + LocalSize[I] = 0; + GlobalOffset[I] = Offset[I]; + NumWorkGroups[I] = 0; + } + setNDRangeLeftover(Dims_); Dims = Dims_; } @@ -72,14 +89,42 @@ class NDRDescT { GlobalSize[I] = ExecutionRange.get_global_range()[I]; LocalSize[I] = ExecutionRange.get_local_range()[I]; GlobalOffset[I] = ExecutionRange.get_offset()[I]; + NumWorkGroups[I] = 0; } - setNDRangeLeftover(); + setNDRangeLeftover(Dims_); + Dims = Dims_; + } + + void set(int Dims_, sycl::nd_range<3> ExecutionRange) { + for (int I = 0; I < Dims_; ++I) { + GlobalSize[I] = ExecutionRange.get_global_range()[I]; + LocalSize[I] = ExecutionRange.get_local_range()[I]; + GlobalOffset[I] = ExecutionRange.get_offset()[I]; + NumWorkGroups[I] = 0; + } + setNDRangeLeftover(Dims_); + Dims = Dims_; + } + + template void setNumWorkGroups(sycl::range N) { + for (int I = 0; I < Dims_; ++I) { + GlobalSize[I] = 0; + // '0' is a mark to adjust before kernel launch when there is enough info: + LocalSize[I] = 0; + GlobalOffset[I] = 0; + NumWorkGroups[I] = N[I]; + } + setNDRangeLeftover(Dims_); Dims = Dims_; } sycl::range<3> GlobalSize; sycl::range<3> LocalSize; sycl::id<3> GlobalOffset; + /// Number of workgroups, used to record the number of workgroups from the + /// simplest form of parallel_for_work_group. If set, all other fields must be + /// zero + sycl::range<3> NumWorkGroups; size_t Dims; }; @@ -102,7 +147,26 @@ class HostKernel : public HostKernelBase { public: HostKernel(KernelType Kernel) : MKernel(Kernel) {} - void call(const NDRDescT &NDRDesc) override { runOnHost(NDRDesc); } + void call(const NDRDescT &NDRDesc) override { + // adjust ND range for serial host: + NDRDescT AdjustedRange; + bool Adjust = false; + + if (NDRDesc.GlobalSize[0] == 0 && NDRDesc.NumWorkGroups[0] != 0) { + // This is a special case - NDRange information is not complete, only the + // desired number of work groups is set by the user. Choose work group + // size (LocalSize), calculate the missing NDRange characteristics + // needed to invoke the kernel and adjust the NDRange descriptor + // accordingly. For some devices the work group size selection requires + // access to the device's properties, hence such late "adjustment". + range<3> WGsize = {1, 1, 1}; // no better alternative for serial host? + AdjustedRange.set(NDRDesc.Dims, + nd_range<3>(NDRDesc.NumWorkGroups * WGsize, WGsize)); + Adjust = true; + } + const NDRDescT &R = Adjust ? AdjustedRange : NDRDesc; + runOnHost(R); + } char *getPtr() override { return reinterpret_cast(&MKernel); } @@ -165,6 +229,7 @@ class HostKernel : public HostKernelBase { sycl::id<3> GroupSize; for (int I = 0; I < 3; ++I) { GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; + // TODO supoport case NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0 } sycl::range GlobalSize; @@ -217,6 +282,30 @@ class HostKernel : public HostKernelBase { } } } + + template + enable_if_t>::value> + runOnHost(const NDRDescT &NDRDesc) { + sycl::id NGroups; + + for (int I = 0; I < Dims; ++I) { + NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; + assert(NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] == 0); + } + sycl::range GlobalSize; + sycl::range LocalSize; + + for (int I = 0; I < Dims; ++I) { + LocalSize[I] = NDRDesc.LocalSize[I]; + GlobalSize[I] = NDRDesc.GlobalSize[I]; + } + detail::NDLoop::iterate(NGroups, [&](const id &GroupID) { + sycl::group Group = + IDBuilder::createGroup(GlobalSize, LocalSize, GroupID); + MKernel(Group); + }); + } + ~HostKernel() = default; }; diff --git a/sycl/include/CL/sycl/detail/common.hpp b/sycl/include/CL/sycl/detail/common.hpp index adb5b2beb28d..2acb68487b0e 100644 --- a/sycl/include/CL/sycl/detail/common.hpp +++ b/sycl/include/CL/sycl/detail/common.hpp @@ -126,6 +126,104 @@ template T createSyclObjFromImpl(decltype(T::impl) ImplObj) { return T(ImplObj); } +// Produces N-dimensional object of type T whose all components are initialized +// to given integer value. +template class T> struct InitializedVal { + template static T &&get(); +}; + +// Specialization for a one-dimensional type. +template