Skip to content

Commit

Permalink
[SYCL][NFCI][ABI-Break] Move handler members to impl (intel#14460)
Browse files Browse the repository at this point in the history
This moves some of the members in the handler class to its impl class.
Doing so allows developers to change these arguments without breaking
ABI. This also moves more implementation details, such as command-group
classes, launch configuration information, argument information and
HostTask tracking, into sources to avoid hard-to-find ABI breaks in the
communication between headers and runtime library.

In addition to this, the following improvements are made:
* The HostKernel class has been simplified to no longer have call and
runOnHost functions.
* The HostTask wrapper class has been moved to sources and the owner has
been changed from a unique_ptr to a shared_ptr, which prevents the need
for including host_task_impl.hpp in odd places.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
  • Loading branch information
steffenlarsen authored and smanna12 committed Jul 16, 2024
1 parent 4d1ef3a commit 0c368b1
Show file tree
Hide file tree
Showing 85 changed files with 1,102 additions and 1,360 deletions.
309 changes: 28 additions & 281 deletions sycl/include/sycl/detail/cg_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,117 +37,34 @@ class handler;
namespace detail {
class HostTask;

// The structure represents kernel argument.
class ArgDesc {
public:
ArgDesc(sycl::detail::kernel_param_kind_t Type, void *Ptr, int Size,
int Index)
: MType(Type), MPtr(Ptr), MSize(Size), MIndex(Index) {}

sycl::detail::kernel_param_kind_t MType;
void *MPtr;
int MSize;
int MIndex;
};

// The structure represents NDRange - global, local sizes, global offset,
// number of dimensions, and the cluster dimensions if applicable.
class NDRDescT {
// The method initializes all sizes for dimensions greater than the passed one
// to the default values, so they will not affect execution.
void setNDRangeLeftover(int Dims_) {
for (int I = Dims_; I < 3; ++I) {
GlobalSize[I] = 1;
LocalSize[I] = LocalSize[0] ? 1 : 0;
GlobalOffset[I] = 0;
NumWorkGroups[I] = 0;
}
}

public:
NDRDescT()
: GlobalSize{0, 0, 0}, LocalSize{0, 0, 0}, NumWorkGroups{0, 0, 0},
Dims{0} {}

template <int Dims_> void set(sycl::range<Dims_> NumWorkItems) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = NumWorkItems[I];
LocalSize[I] = 0;
GlobalOffset[I] = 0;
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

// Initializes this ND range descriptor with given range of work items and
// offset.
template <int Dims_>
void set(sycl::range<Dims_> NumWorkItems, sycl::id<Dims_> Offset) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = NumWorkItems[I];
LocalSize[I] = 0;
GlobalOffset[I] = Offset[I];
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

template <int Dims_> void set(sycl::nd_range<Dims_> ExecutionRange) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = ExecutionRange.get_global_range()[I];
LocalSize[I] = ExecutionRange.get_local_range()[I];
GlobalOffset[I] = ExecutionRange.get_offset()[I];
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

void set(int Dims_, sycl::nd_range<3> ExecutionRange) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = ExecutionRange.get_global_range()[I];
LocalSize[I] = ExecutionRange.get_local_range()[I];
GlobalOffset[I] = ExecutionRange.get_offset()[I];
NumWorkGroups[I] = 0;
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

template <int Dims_> void setNumWorkGroups(sycl::range<Dims_> N) {
for (int I = 0; I < Dims_; ++I) {
GlobalSize[I] = 0;
// '0' is a mark to adjust before kernel launch when there is enough info:
LocalSize[I] = 0;
GlobalOffset[I] = 0;
NumWorkGroups[I] = N[I];
}
setNDRangeLeftover(Dims_);
Dims = Dims_;
}

template <int Dims_> void setClusterDimensions(sycl::range<Dims_> N) {
if (Dims_ != Dims) {
throw std::runtime_error(
"Dimensionality of cluster, global and local ranges must be same");
}

for (int I = 0; I < Dims_; ++I) {
ClusterDimensions[I] = N[I];
}
}

sycl::range<3> GlobalSize;
sycl::range<3> LocalSize;
sycl::id<3> GlobalOffset;
/// Number of workgroups, used to record the number of workgroups from the
/// simplest form of parallel_for_work_group. If set, all other fields must be
/// zero
sycl::range<3> NumWorkGroups;
sycl::range<3> ClusterDimensions{1, 1, 1};
size_t Dims;
/// Type of the command group.
/// NOTE: Changing the values of any of these enumerators is an API-break.
enum class CGType : unsigned int {
None = 0,
Kernel = 1,
CopyAccToPtr = 2,
CopyPtrToAcc = 3,
CopyAccToAcc = 4,
Barrier = 5,
BarrierWaitlist = 6,
Fill = 7,
UpdateHost = 8,
CopyUSM = 10,
FillUSM = 11,
PrefetchUSM = 12,
CodeplayHostTask = 14,
AdviseUSM = 15,
Copy2DUSM = 16,
Fill2DUSM = 17,
Memset2DUSM = 18,
CopyToDeviceGlobal = 19,
CopyFromDeviceGlobal = 20,
ReadWriteHostPipe = 21,
ExecCommandBuffer = 22,
CopyImage = 23,
SemaphoreWait = 24,
SemaphoreSignal = 25,
ProfilingTag = 26,
};

template <typename, typename T> struct check_fn_signature {
Expand Down Expand Up @@ -236,8 +153,6 @@ runKernelWithArg(KernelType KernelName, ArgType Arg) {
// The pure virtual class aimed to store lambda/functors of any type.
class HostKernelBase {
public:
// The method executes lambda stored using NDRange passed.
virtual void call(const NDRDescT &NDRDesc, HostProfilingInfo *HPI) = 0;
// Return pointer to the lambda object.
// Used to extract captured variables.
virtual char *getPtr() = 0;
Expand All @@ -255,177 +170,9 @@ class HostKernel : public HostKernelBase {

public:
HostKernel(KernelType Kernel) : MKernel(Kernel) {}
void call(const NDRDescT &NDRDesc, HostProfilingInfo *HPI) override {
// adjust ND range for serial host:
NDRDescT AdjustedRange = NDRDesc;

if (NDRDesc.GlobalSize[0] == 0 && NDRDesc.NumWorkGroups[0] != 0) {
// This is a special case - NDRange information is not complete, only the
// desired number of work groups is set by the user. Choose work group
// size (LocalSize), calculate the missing NDRange characteristics
// needed to invoke the kernel and adjust the NDRange descriptor
// accordingly. For some devices the work group size selection requires
// access to the device's properties, hence such late "adjustment".
range<3> WGsize{1, 1, 1}; // no better alternative for serial host?
AdjustedRange.set(NDRDesc.Dims,
nd_range<3>(NDRDesc.NumWorkGroups * WGsize, WGsize));
}
// If local size for host is not set explicitly, let's adjust it to 1,
// so an exception for zero local size is not thrown.
if (AdjustedRange.LocalSize[0] == 0)
for (size_t I = 0; I < AdjustedRange.Dims; ++I)
AdjustedRange.LocalSize[I] = 1;
if (HPI)
HPI->start();
runOnHost(AdjustedRange);
if (HPI)
HPI->end();
}

char *getPtr() override { return reinterpret_cast<char *>(&MKernel); }

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, void>>
runOnHost(const NDRDescT &) {
runKernelWithoutArg(MKernel);
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, sycl::id<Dims>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
sycl::id<Dims> Offset;
sycl::range<Dims> Stride(
InitializedVal<Dims, range>::template get<1>()); // initialized to 1
sycl::range<Dims> UpperBound(
InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
Range[I] = NDRDesc.GlobalSize[I];
Offset[I] = NDRDesc.GlobalOffset[I];
UpperBound[I] = Range[I] + Offset[I];
}

detail::NDLoop<Dims>::iterate(
/*LowerBound=*/Offset, Stride, UpperBound,
[&](const sycl::id<Dims> &ID) {
sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, ID, Offset);

runKernelWithArg<const sycl::id<Dims> &>(MKernel, ID);
});
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, item<Dims, /*Offset=*/false>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::id<Dims> ID;
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I)
Range[I] = NDRDesc.GlobalSize[I];

detail::NDLoop<Dims>::iterate(Range, [&](const sycl::id<Dims> ID) {
sycl::item<Dims, /*Offset=*/false> Item =
IDBuilder::createItem<Dims, false>(Range, ID);
sycl::item<Dims, /*Offset=*/true> ItemWithOffset = Item;

runKernelWithArg<sycl::item<Dims, /*Offset=*/false>>(MKernel, Item);
});
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, item<Dims, /*Offset=*/true>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> Range(InitializedVal<Dims, range>::template get<0>());
sycl::id<Dims> Offset;
sycl::range<Dims> Stride(
InitializedVal<Dims, range>::template get<1>()); // initialized to 1
sycl::range<Dims> UpperBound(
InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
Range[I] = NDRDesc.GlobalSize[I];
Offset[I] = NDRDesc.GlobalOffset[I];
UpperBound[I] = Range[I] + Offset[I];
}

detail::NDLoop<Dims>::iterate(
/*LowerBound=*/Offset, Stride, UpperBound,
[&](const sycl::id<Dims> &ID) {
sycl::item<Dims, /*Offset=*/true> Item =
IDBuilder::createItem<Dims, true>(Range, ID, Offset);

runKernelWithArg<sycl::item<Dims, /*Offset=*/true>>(MKernel, Item);
});
}

template <class ArgT = KernelArgType>
typename std::enable_if_t<std::is_same_v<ArgT, nd_item<Dims>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> GroupSize(InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
if (NDRDesc.LocalSize[I] == 0 ||
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
throw sycl::exception(make_error_code(errc::nd_range),
"Invalid local size for global size");
GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
}

sycl::range<Dims> LocalSize(InitializedVal<Dims, range>::template get<0>());
sycl::range<Dims> GlobalSize(
InitializedVal<Dims, range>::template get<0>());
sycl::id<Dims> GlobalOffset;
for (int I = 0; I < Dims; ++I) {
GlobalOffset[I] = NDRDesc.GlobalOffset[I];
LocalSize[I] = NDRDesc.LocalSize[I];
GlobalSize[I] = NDRDesc.GlobalSize[I];
}

detail::NDLoop<Dims>::iterate(GroupSize, [&](const id<Dims> &GroupID) {
sycl::group<Dims> Group = IDBuilder::createGroup<Dims>(
GlobalSize, LocalSize, GroupSize, GroupID);

detail::NDLoop<Dims>::iterate(LocalSize, [&](const id<Dims> &LocalID) {
id<Dims> GlobalID =
GroupID * id<Dims>{LocalSize} + LocalID + GlobalOffset;
const sycl::item<Dims, /*Offset=*/true> GlobalItem =
IDBuilder::createItem<Dims, true>(GlobalSize, GlobalID,
GlobalOffset);
const sycl::item<Dims, /*Offset=*/false> LocalItem =
IDBuilder::createItem<Dims, false>(LocalSize, LocalID);
const sycl::nd_item<Dims> NDItem =
IDBuilder::createNDItem<Dims>(GlobalItem, LocalItem, Group);

runKernelWithArg<const sycl::nd_item<Dims>>(MKernel, NDItem);
});
});
}

template <typename ArgT = KernelArgType>
std::enable_if_t<std::is_same_v<ArgT, sycl::group<Dims>>>
runOnHost(const NDRDescT &NDRDesc) {
sycl::range<Dims> NGroups(InitializedVal<Dims, range>::template get<0>());

for (int I = 0; I < Dims; ++I) {
if (NDRDesc.LocalSize[I] == 0 ||
NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
throw sycl::exception(make_error_code(errc::nd_range),
"Invalid local size for global size");
NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
}

sycl::range<Dims> LocalSize(InitializedVal<Dims, range>::template get<0>());
sycl::range<Dims> GlobalSize(
InitializedVal<Dims, range>::template get<0>());
for (int I = 0; I < Dims; ++I) {
LocalSize[I] = NDRDesc.LocalSize[I];
GlobalSize[I] = NDRDesc.GlobalSize[I];
}
detail::NDLoop<Dims>::iterate(NGroups, [&](const id<Dims> &GroupID) {
sycl::group<Dims> Group =
IDBuilder::createGroup<Dims>(GlobalSize, LocalSize, NGroups, GroupID);
runKernelWithArg<sycl::group<Dims>>(MKernel, Group);
});
}

~HostKernel() = default;
};

Expand Down
1 change: 0 additions & 1 deletion sycl/include/sycl/ext/oneapi/memcpy2d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#pragma once

#include <sycl/detail/host_task_impl.hpp>
#include <sycl/handler.hpp>
#include <sycl/queue.hpp>
#include <sycl/usm/usm_enums.hpp>
Expand Down
8 changes: 1 addition & 7 deletions sycl/include/sycl/group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -673,13 +673,7 @@ template <int Dimensions = 1> class __SYCL_TYPE(group) group {
friend class detail::Builder;
group(const range<Dimensions> &G, const range<Dimensions> &L,
const range<Dimensions> GroupRange, const id<Dimensions> &I)
: globalRange(G), localRange(L), groupRange(GroupRange), index(I) {
// Make sure local range divides global without remainder:
__SYCL_ASSERT(((G % L).size() == 0) &&
"global range is not multiple of local");
__SYCL_ASSERT((((G / L) - GroupRange).size() == 0) &&
"inconsistent group constructor arguments");
}
: globalRange(G), localRange(L), groupRange(GroupRange), index(I) {}
};
} // namespace _V1
} // namespace sycl
Loading

0 comments on commit 0c368b1

Please sign in to comment.