Skip to content

[SYCL] Implement basic reduction for parallel_for() accepting nd_range #1585

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 28, 2020
Merged
7 changes: 2 additions & 5 deletions sycl/include/CL/sycl/builtins.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@ namespace sycl {
#else
namespace __sycl_std = __host_std;
#endif
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
/* ----------------- 4.13.3 Math functions. ---------------------------------*/
// genfloat acos (genfloat x)
template <typename T>
Expand Down Expand Up @@ -731,7 +727,8 @@ detail::enable_if_t<detail::is_geninteger<T>::value, T> clz(T x) __NOEXC {
namespace intel {
// geninteger ctz (geninteger x)
template <typename T>
detail::enable_if_t<detail::is_geninteger<T>::value, T> ctz(T x) __NOEXC {
sycl::detail::enable_if_t<sycl::detail::is_geninteger<T>::value, T>
ctz(T x) __NOEXC {
return __sycl_std::__invoke_ctz<T>(x);
}
} // namespace intel
Expand Down
2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class interop_handler {

public:
using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
using ReqToMem = std::pair<detail::Requirement*, pi_mem>;
using ReqToMem = std::pair<detail::Requirement *, pi_mem>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be an unrelated change

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was done by clang-format


interop_handler(std::vector<ReqToMem> MemObjs, QueueImplPtr Queue)
: MQueue(std::move(Queue)), MMemObjs(std::move(MemObjs)) {}
Expand Down
395 changes: 393 additions & 2 deletions sycl/include/CL/sycl/handler.hpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion sycl/include/CL/sycl/intel/function_pointer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName,
PI_INVALID_OPERATION);
}

return detail::getDeviceFunctionPointerImpl(D, P, FuncName);
return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName);
}
} // namespace intel
} // namespace sycl
Expand Down
26 changes: 26 additions & 0 deletions sycl/include/CL/sycl/intel/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,13 +342,39 @@ class reduction_impl {
"Only scalar/1-element reductions are supported now.");
}

accessor<T, buffer_dim, access::mode::discard_read_write,
access::target::local>
getReadWriteLocalAcc(size_t Size, handler &CGH) {
return accessor<T, buffer_dim, access::mode::discard_read_write,
access::target::local>(Size, CGH);
}

accessor<T, buffer_dim, access::mode::read>
getReadAccToPreviousPartialReds(handler &CGH) const {
CGH.addReduction(MOutBufPtr);
return accessor<T, buffer_dim, access::mode::read>(*MOutBufPtr, CGH);
}

accessor_type getWriteAccForPartialReds(size_t Size, size_t RunNumber,
handler &CGH) {
if (Size == 1) {
if (RunNumber > 0)
CGH.associateWithHandler(this->MAcc);
return this->MAcc;
}
// Create a new output buffer and return an accessor to it.
MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
CGH.addReduction(MOutBufPtr);
return accessor_type(*MOutBufPtr, CGH);
}
/// User's accessor to where the reduction must be written.
accessor_type MAcc;

private:
/// Identity of the BinaryOperation.
/// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
const T MIdentity;
shared_ptr_class<buffer<T, buffer_dim>> MOutBufPtr;
};

} // namespace detail
Expand Down
13 changes: 7 additions & 6 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,11 @@ class queue_impl {
/// \return a native handle.
pi_native_handle getNative() const;

/// Stores an event that should be associated with the queue
///
/// \param Event is the event to be stored
void addEvent(event Event);

private:
/// Performs command group submission to the queue.
///
Expand All @@ -362,8 +367,9 @@ class queue_impl {
shared_ptr_class<queue_impl> Self,
const detail::code_location &Loc) {
handler Handler(std::move(Self), MHostQueue);
Handler.saveCodeLoc(Loc);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apply comments from handler.hpp:247 - ordered_queue will require the same changes.

CGF(Handler);
event Event = Handler.finalize(Loc);
event Event = Handler.finalize();
addEvent(Event);
return Event;
}
Expand All @@ -377,11 +383,6 @@ class queue_impl {
void instrumentationEpilog(void *TelementryEvent, string_class &Name,
int32_t StreamID, uint64_t IId);

/// Stores an event that should be associated with the queue
///
/// \param Event is the event to be stored
void addEvent(event Event);

/// Stores a USM operation event that should be associated with the queue
///
/// \param Event is the event to be stored
Expand Down
36 changes: 24 additions & 12 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,24 @@
#include <CL/sycl/handler.hpp>
#include <CL/sycl/info/info_desc.hpp>
#include <detail/kernel_impl.hpp>
#include <detail/queue_impl.hpp>
#include <detail/scheduler/scheduler.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
event handler::finalize(const cl::sycl::detail::code_location &Payload) {
sycl::event EventRet;

void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
cl::sycl::event Event) {
Queue->addEvent(std::move(Event));
}

event handler::finalize() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ordered_queue sends in the code location information through finalize as a parameter. This change will affect ordered_queue. Please ensure that ordered_queue is correct as well.

Copy link
Contributor Author

@v-klochkov v-klochkov Apr 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you please show the code that calls finalize() method and passes code_location to it?
I grepped all files in SYCL folder and did not find any calls that would not be fixed.
The only finalize(code_loc) call was in queue_impl::submit_impl(), which is used by ordered_queue.

But I fixed submit_impl, so no additional changes required.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@v-klochkov I looked at it too and the share the queue_impl, so everything looks good.

// This block of code is needed only for reduction implementation.
// It is harmless (does nothing) for everything else.
if (MIsFinalized)
return MLastEvent;
MIsFinalized = true;

unique_ptr_class<detail::CG> CommandGroup;
switch (MCGType) {
case detail::CG::KERNEL:
Expand All @@ -29,52 +41,52 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
std::move(MSharedPtrStorage), std::move(MRequirements),
std::move(MEvents), std::move(MArgs), std::move(MKernelName),
std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType,
Payload));
MCodeLoc));
break;
}
case detail::CG::INTEROP_TASK_CODEPLAY:
CommandGroup.reset(new detail::CGInteropTask(
std::move(MInteropTask), std::move(MArgsStorage),
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), MCGType, Payload));
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
break;
case detail::CG::COPY_ACC_TO_PTR:
case detail::CG::COPY_PTR_TO_ACC:
case detail::CG::COPY_ACC_TO_ACC:
CommandGroup.reset(new detail::CGCopy(
MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage),
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), Payload));
std::move(MRequirements), std::move(MEvents), MCodeLoc));
break;
case detail::CG::FILL:
CommandGroup.reset(new detail::CGFill(
std::move(MPattern), MDstPtr, std::move(MArgsStorage),
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), Payload));
std::move(MRequirements), std::move(MEvents), MCodeLoc));
break;
case detail::CG::UPDATE_HOST:
CommandGroup.reset(new detail::CGUpdateHost(
MDstPtr, std::move(MArgsStorage), std::move(MAccStorage),
std::move(MSharedPtrStorage), std::move(MRequirements),
std::move(MEvents), Payload));
std::move(MEvents), MCodeLoc));
break;
case detail::CG::COPY_USM:
CommandGroup.reset(new detail::CGCopyUSM(
MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage),
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), Payload));
std::move(MRequirements), std::move(MEvents), MCodeLoc));
break;
case detail::CG::FILL_USM:
CommandGroup.reset(new detail::CGFillUSM(
std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage),
std::move(MAccStorage), std::move(MSharedPtrStorage),
std::move(MRequirements), std::move(MEvents), Payload));
std::move(MRequirements), std::move(MEvents), MCodeLoc));
break;
case detail::CG::PREFETCH_USM:
CommandGroup.reset(new detail::CGPrefetchUSM(
MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage),
std::move(MSharedPtrStorage), std::move(MRequirements),
std::move(MEvents), Payload));
std::move(MEvents), MCodeLoc));
break;
case detail::CG::NONE:
throw runtime_error("Command group submitted without a kernel or a "
Expand All @@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
std::move(CommandGroup), std::move(MQueue));

EventRet = detail::createSyclObjFromImpl<event>(Event);
return EventRet;
MLastEvent = detail::createSyclObjFromImpl<event>(Event);
return MLastEvent;
}

void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
Expand Down
3 changes: 2 additions & 1 deletion sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3232,7 +3232,8 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
_ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
_ZN2cl4sycl7handler18extractArgsAndReqsEv
_ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
_ZN2cl4sycl7handler8finalizeERKNS0_6detail13code_locationE
_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
_ZN2cl4sycl7handler8finalizeEv
_ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
_ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
_ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l
Expand Down
6 changes: 5 additions & 1 deletion sycl/test/abi/symbol_size.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@ int main() {
check_size<device_selector, 8>();
check_size<event, 16>();
check_size<gpu_selector, 8>();
check_size<handler, 472>();
#ifdef _MSC_VER
check_size<handler, 520>();
#else
check_size<handler, 528>();
#endif
check_size<image<1>, 16>();
check_size<kernel, 16>();
check_size<platform, 16>();
Expand Down
121 changes: 121 additions & 0 deletions sycl/test/reduction/reduction_nd_conditional.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(nd_range, reduction, func)
// with reduction and conditional increment of the reduction variable.

#include <CL/sycl.hpp>
#include <cassert>

using namespace cl::sycl;

template <typename T, class BinaryOperation>
void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
BinaryOperation BOp, size_t N) {
ExpectedOut = Identity;
auto In = InBuf.template get_access<access::mode::write>();
for (int I = 0; I < N; ++I) {
if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
else
In[I] = I + 1 + 1.1;

if (I < 2)
ExpectedOut = BOp(ExpectedOut, 99);
else if (I % 3)
ExpectedOut = BOp(ExpectedOut, In[I]);
else
; // do nothing.
}
};

template <typename T, int Dim, class BinaryOperation>
class SomeClass;

template <typename T>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could create a header file for definition of this class, since it is used in several tests

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I'd be tempted to rename it something like CustomType or CustomVec as well, to highlight why you're defining a new class instead of using sycl::vec.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I created a new reduction_utils.hpp header and renamed the type. Thank you.

struct Vec {
Vec() : X(0), Y(0) {}
Vec(T X, T Y) : X(X), Y(Y) {}
Vec(T V) : X(V), Y(V) {}
bool operator==(const Vec &P) const {
return P.X == X && P.Y == Y;
}
bool operator!=(const Vec &P) const {
return !(*this == P);
}
T X;
T Y;
};
template <typename T>
bool operator==(const Vec<T> &A, const Vec<T> &B) {
return A.X == B.X && A.Y == B.Y;
}
template <typename T>
std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
return OS << "(" << P.X << ", " << P.Y << ")";
}

template <class T>
struct VecPlus {
using P = Vec<T>;
P operator()(const P &A, const P &B) const {
return P(A.X + B.X, A.Y + B.Y);
}
};

template <typename T, int Dim, class BinaryOperation>
void test(T Identity, size_t WGSize, size_t NWItems) {
buffer<T, 1> InBuf(NWItems);
buffer<T, 1> OutBuf(1);

// Initialize.
BinaryOperation BOp;
T CorrectOut;
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);

// Compute.
queue Q;
Q.submit([&](handler &CGH) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
Out(OutBuf, CGH);
auto Redu = intel::reduction(Out, Identity, BOp);

range<1> GlobalRange(NWItems);
range<1> LocalRange(WGSize);
nd_range<1> NDRange(GlobalRange, LocalRange);
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
size_t I = NDIt.get_global_linear_id();
if (I < 2)
Sum.combine(T(99));
else if (I % 3)
Sum.combine(In[I]);
else
; // do nothing.
});
});

// Check correctness.
auto Out = OutBuf.template get_access<access::mode::read>();
T ComputedOut = *(Out.get_pointer());
if (ComputedOut != CorrectOut) {
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
std::cout << "Computed value: " << ComputedOut
<< ", Expected value: " << CorrectOut << "\n";
assert(0 && "Wrong value.");
}
}

int main() {
test<int, 0, intel::plus<int>>(0, 2, 2);
test<int, 1, intel::plus<int>>(0, 7, 7);
test<int, 0, intel::plus<int>>(0, 2, 64);
test<short, 1, intel::plus<short>>(0, 16, 256);

std::cout << "Test passed\n";
return 0;
}
Loading