-
Notifications
You must be signed in to change notification settings - Fork 769
[SYCL] Implement basic reduction for parallel_for() accepting nd_range #1585
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
29cdc5e
0596743
f974f85
3b4af65
a409752
a6350ba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -351,6 +351,11 @@ class queue_impl { | |
/// \return a native handle. | ||
pi_native_handle getNative() const; | ||
|
||
/// Stores an event that should be associated with the queue | ||
/// | ||
/// \param Event is the event to be stored | ||
void addEvent(event Event); | ||
|
||
private: | ||
/// Performs command group submission to the queue. | ||
/// | ||
|
@@ -362,8 +367,9 @@ class queue_impl { | |
shared_ptr_class<queue_impl> Self, | ||
const detail::code_location &Loc) { | ||
handler Handler(std::move(Self), MHostQueue); | ||
Handler.saveCodeLoc(Loc); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Apply the comments from handler.hpp:247 here as well; ordered_queue will require the same changes. |
||
CGF(Handler); | ||
event Event = Handler.finalize(Loc); | ||
event Event = Handler.finalize(); | ||
addEvent(Event); | ||
return Event; | ||
} | ||
|
@@ -377,11 +383,6 @@ class queue_impl { | |
void instrumentationEpilog(void *TelementryEvent, string_class &Name, | ||
int32_t StreamID, uint64_t IId); | ||
|
||
/// Stores an event that should be associated with the queue | ||
/// | ||
/// \param Event is the event to be stored | ||
void addEvent(event Event); | ||
|
||
/// Stores a USM operation event that should be associated with the queue | ||
/// | ||
/// \param Event is the event to be stored | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,12 +13,24 @@ | |
#include <CL/sycl/handler.hpp> | ||
#include <CL/sycl/info/info_desc.hpp> | ||
#include <detail/kernel_impl.hpp> | ||
#include <detail/queue_impl.hpp> | ||
#include <detail/scheduler/scheduler.hpp> | ||
|
||
__SYCL_INLINE_NAMESPACE(cl) { | ||
namespace sycl { | ||
event handler::finalize(const cl::sycl::detail::code_location &Payload) { | ||
sycl::event EventRet; | ||
|
||
/// Associates an event with a queue by forwarding it to
/// detail::queue_impl::addEvent().
///
/// \param Queue is the queue implementation that should store the event
/// \param Event is the event to be stored; it is moved into the addEvent() call
void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue, | ||
cl::sycl::event Event) { | ||
Queue->addEvent(std::move(Event)); | ||
} | ||
|
||
event handler::finalize() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ordered_queue sends in the code location information through finalize as a parameter. This change will affect ordered_queue. Please ensure that ordered_queue is correct as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would you please show the code that calls the finalize() method and passes code_location to it? In any case, I fixed submit_impl, so no additional changes are required. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @v-klochkov I looked at it too, and they share the queue_impl, so everything looks good. |
||
// This block of code is needed only for reduction implementation. | ||
// It is harmless (does nothing) for everything else. | ||
if (MIsFinalized) | ||
return MLastEvent; | ||
MIsFinalized = true; | ||
|
||
unique_ptr_class<detail::CG> CommandGroup; | ||
switch (MCGType) { | ||
case detail::CG::KERNEL: | ||
|
@@ -29,52 +41,52 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { | |
std::move(MSharedPtrStorage), std::move(MRequirements), | ||
std::move(MEvents), std::move(MArgs), std::move(MKernelName), | ||
std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, | ||
Payload)); | ||
MCodeLoc)); | ||
break; | ||
} | ||
case detail::CG::INTEROP_TASK_CODEPLAY: | ||
CommandGroup.reset(new detail::CGInteropTask( | ||
std::move(MInteropTask), std::move(MArgsStorage), | ||
std::move(MAccStorage), std::move(MSharedPtrStorage), | ||
std::move(MRequirements), std::move(MEvents), MCGType, Payload)); | ||
std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); | ||
break; | ||
case detail::CG::COPY_ACC_TO_PTR: | ||
case detail::CG::COPY_PTR_TO_ACC: | ||
case detail::CG::COPY_ACC_TO_ACC: | ||
CommandGroup.reset(new detail::CGCopy( | ||
MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), | ||
std::move(MAccStorage), std::move(MSharedPtrStorage), | ||
std::move(MRequirements), std::move(MEvents), Payload)); | ||
std::move(MRequirements), std::move(MEvents), MCodeLoc)); | ||
break; | ||
case detail::CG::FILL: | ||
CommandGroup.reset(new detail::CGFill( | ||
std::move(MPattern), MDstPtr, std::move(MArgsStorage), | ||
std::move(MAccStorage), std::move(MSharedPtrStorage), | ||
std::move(MRequirements), std::move(MEvents), Payload)); | ||
std::move(MRequirements), std::move(MEvents), MCodeLoc)); | ||
break; | ||
case detail::CG::UPDATE_HOST: | ||
CommandGroup.reset(new detail::CGUpdateHost( | ||
MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), | ||
std::move(MSharedPtrStorage), std::move(MRequirements), | ||
std::move(MEvents), Payload)); | ||
std::move(MEvents), MCodeLoc)); | ||
break; | ||
case detail::CG::COPY_USM: | ||
CommandGroup.reset(new detail::CGCopyUSM( | ||
MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage), | ||
std::move(MAccStorage), std::move(MSharedPtrStorage), | ||
std::move(MRequirements), std::move(MEvents), Payload)); | ||
std::move(MRequirements), std::move(MEvents), MCodeLoc)); | ||
break; | ||
case detail::CG::FILL_USM: | ||
CommandGroup.reset(new detail::CGFillUSM( | ||
std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage), | ||
std::move(MAccStorage), std::move(MSharedPtrStorage), | ||
std::move(MRequirements), std::move(MEvents), Payload)); | ||
std::move(MRequirements), std::move(MEvents), MCodeLoc)); | ||
break; | ||
case detail::CG::PREFETCH_USM: | ||
CommandGroup.reset(new detail::CGPrefetchUSM( | ||
MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), | ||
std::move(MSharedPtrStorage), std::move(MRequirements), | ||
std::move(MEvents), Payload)); | ||
std::move(MEvents), MCodeLoc)); | ||
break; | ||
case detail::CG::NONE: | ||
throw runtime_error("Command group submitted without a kernel or a " | ||
|
@@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) { | |
detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( | ||
std::move(CommandGroup), std::move(MQueue)); | ||
|
||
EventRet = detail::createSyclObjFromImpl<event>(Event); | ||
return EventRet; | ||
MLastEvent = detail::createSyclObjFromImpl<event>(Event); | ||
return MLastEvent; | ||
} | ||
|
||
void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out | ||
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out | ||
// RUN: %CPU_RUN_PLACEHOLDER %t.out | ||
// RUN: %GPU_RUN_PLACEHOLDER %t.out | ||
// RUN: %ACC_RUN_PLACEHOLDER %t.out | ||
|
||
// This test performs basic checks of parallel_for(nd_range, reduction, func) | ||
// with reduction and conditional increment of the reduction variable. | ||
|
||
#include <CL/sycl.hpp> | ||
#include <cassert> | ||
|
||
using namespace cl::sycl; | ||
|
||
template <typename T, class BinaryOperation> | ||
void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity, | ||
BinaryOperation BOp, size_t N) { | ||
ExpectedOut = Identity; | ||
auto In = InBuf.template get_access<access::mode::write>(); | ||
for (int I = 0; I < N; ++I) { | ||
if (std::is_same<BinaryOperation, std::multiplies<T>>::value) | ||
In[I] = 1 + (((I % 37) == 0) ? 1 : 0); | ||
else | ||
In[I] = I + 1 + 1.1; | ||
|
||
if (I < 2) | ||
ExpectedOut = BOp(ExpectedOut, 99); | ||
else if (I % 3) | ||
ExpectedOut = BOp(ExpectedOut, In[I]); | ||
else | ||
; // do nothing. | ||
} | ||
}; | ||
|
||
template <typename T, int Dim, class BinaryOperation> | ||
class SomeClass; | ||
|
||
template <typename T> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You could create a header file for the definition of this class, since it is used in several tests. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed. I'd be tempted to rename it to something like [suggested name missing from this extract]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, I created a new reduction_utils.hpp header and renamed the type. Thank you. |
||
struct Vec { | ||
Vec() : X(0), Y(0) {} | ||
Vec(T X, T Y) : X(X), Y(Y) {} | ||
Vec(T V) : X(V), Y(V) {} | ||
bool operator==(const Vec &P) const { | ||
return P.X == X && P.Y == Y; | ||
} | ||
bool operator!=(const Vec &P) const { | ||
return !(*this == P); | ||
} | ||
T X; | ||
T Y; | ||
}; | ||
template <typename T> | ||
bool operator==(const Vec<T> &A, const Vec<T> &B) { | ||
return A.X == B.X && A.Y == B.Y; | ||
} | ||
template <typename T> | ||
std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) { | ||
return OS << "(" << P.X << ", " << P.Y << ")"; | ||
} | ||
|
||
template <class T> | ||
struct VecPlus { | ||
using P = Vec<T>; | ||
P operator()(const P &A, const P &B) const { | ||
return P(A.X + B.X, A.Y + B.Y); | ||
} | ||
}; | ||
|
||
template <typename T, int Dim, class BinaryOperation> | ||
void test(T Identity, size_t WGSize, size_t NWItems) { | ||
buffer<T, 1> InBuf(NWItems); | ||
buffer<T, 1> OutBuf(1); | ||
|
||
// Initialize. | ||
BinaryOperation BOp; | ||
T CorrectOut; | ||
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems); | ||
|
||
// Compute. | ||
queue Q; | ||
Q.submit([&](handler &CGH) { | ||
auto In = InBuf.template get_access<access::mode::read>(CGH); | ||
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer> | ||
Out(OutBuf, CGH); | ||
auto Redu = intel::reduction(Out, Identity, BOp); | ||
|
||
range<1> GlobalRange(NWItems); | ||
range<1> LocalRange(WGSize); | ||
nd_range<1> NDRange(GlobalRange, LocalRange); | ||
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>( | ||
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) { | ||
size_t I = NDIt.get_global_linear_id(); | ||
if (I < 2) | ||
Sum.combine(T(99)); | ||
else if (I % 3) | ||
Sum.combine(In[I]); | ||
else | ||
; // do nothing. | ||
}); | ||
}); | ||
|
||
// Check correctness. | ||
auto Out = OutBuf.template get_access<access::mode::read>(); | ||
T ComputedOut = *(Out.get_pointer()); | ||
if (ComputedOut != CorrectOut) { | ||
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n"; | ||
std::cout << "Computed value: " << ComputedOut | ||
<< ", Expected value: " << CorrectOut << "\n"; | ||
assert(0 && "Wrong value."); | ||
} | ||
} | ||
|
||
/// Test driver: runs the 'plus' reduction for several element types,
/// result-accessor dimensionalities, work-group sizes and global sizes.
/// Arguments of each call are (Identity, WGSize, NWItems).
int main() {
  // int, 0-dimensional result accessor, a single work-group of 2 items.
  test<int, 0, intel::plus<int>>(0, 2, 2);
  // int, 1-dimensional result accessor, a single work-group of 7 items.
  test<int, 1, intel::plus<int>>(0, 7, 7);
  // int, 0-dimensional result accessor, 64 items in work-groups of 2.
  test<int, 0, intel::plus<int>>(0, 2, 64);
  // short, 1-dimensional result accessor, 256 items in work-groups of 16.
  test<short, 1, intel::plus<short>>(0, 16, 256);

  std::cout << "Test passed\n";
  // Falling off the end of main is equivalent to returning 0.
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems to be an unrelated change
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was done by clang-format