Skip to content

Commit dac0398

Browse files
committed
[SYCL] Do not store last event for in-order queues
unless Host Tasks are used. Without Host Tasks, we can just rely on UR for ordering. Having no last event means that ext_oneapi_get_last_event() needs to submit a barrier to return an event to the user. Similarly, ext_oneapi_submit_barrier() now always submits a barrier, even for in-order queues. Whenever Host Tasks are used we need to start recording all events. This is needed because of how kernel submission synchronizes with Host Tasks. With a following scenario: q.host_task(); q.submit_kernel(); q.host_task(): The kernel won't even be submitted to UR until the first Host Task completes. To properly synchronize the second Host Task we need to keep the event describing kernel submission.
1 parent 7b8996e commit dac0398

15 files changed

+113
-1163
lines changed

sycl/source/detail/queue_impl.cpp

+33-26
Original file line numberDiff line numberDiff line change
@@ -282,17 +282,21 @@ event queue_impl::memcpyFromDeviceGlobal(
282282
DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
283283
}
284284

285-
sycl::detail::optional<event> queue_impl::getLastEvent() {
285+
sycl::detail::optional<event>
286+
queue_impl::getLastEvent(const std::shared_ptr<queue_impl> &Self) {
286287
// The external event is required to finish last if set, so it is considered
287288
// the last event if present.
288289
if (std::optional<event> ExternalEvent = MInOrderExternalEvent.read())
289290
return ExternalEvent;
290291

291292
std::lock_guard<std::mutex> Lock{MMutex};
292-
if (MGraph.expired() && !MDefaultGraphDeps.LastEventPtr)
293+
if (MEmpty)
293294
return std::nullopt;
294-
if (MDiscardEvents)
295-
return createDiscardedEvent();
295+
if (MGraph.expired() && !MDefaultGraphDeps.LastEventPtr) {
296+
assert(!MHostTaskMode);
297+
// We insert a marker to represent an event at end.
298+
return detail::createSyclObjFromImpl<event>(insertMarkerEvent(Self));
299+
}
296300
if (!MGraph.expired() && MExtGraphDeps.LastEventPtr)
297301
return detail::createSyclObjFromImpl<event>(MExtGraphDeps.LastEventPtr);
298302
return detail::createSyclObjFromImpl<event>(MDefaultGraphDeps.LastEventPtr);
@@ -305,7 +309,6 @@ void queue_impl::addEvent(const event &Event) {
305309
if (Cmd != nullptr && EImpl->getHandle() == nullptr &&
306310
!EImpl->isDiscarded()) {
307311
std::weak_ptr<event_impl> EventWeakPtr{EImpl};
308-
std::lock_guard<std::mutex> Lock{MMutex};
309312
MEventsWeak.push_back(std::move(EventWeakPtr));
310313
}
311314
}
@@ -339,9 +342,16 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
339342

340343
HandlerImpl->MEventMode = SubmitInfo.EventMode();
341344

342-
auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc());
345+
std::unique_lock<std::mutex> Lock(MMutex, std::defer_lock);
346+
auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc(), Lock);
343347

344-
addEvent(Event);
348+
if (isInOrder() && !shouldRecordLastEvent() && Streams.empty()) {
349+
// NOP
350+
} else {
351+
addEvent(Event);
352+
}
353+
354+
Lock.unlock();
345355

346356
const auto &EventImpl = detail::getSyclObjImpl(Event);
347357
for (auto &Stream : Streams) {
@@ -395,10 +405,13 @@ event queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
395405

396406
HandlerImpl->MEventMode = SubmitInfo.EventMode();
397407

398-
auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc());
408+
std::unique_lock<std::mutex> Lock(MMutex, std::defer_lock);
399409

410+
auto Event = finalizeHandler(Handler, SubmitInfo.PostProcessorFunc(), Lock);
400411
addEvent(Event);
401412

413+
Lock.unlock();
414+
402415
const auto &EventImpl = detail::getSyclObjImpl(Event);
403416
for (auto &Stream : Streams) {
404417
// We don't want stream flushing to be blocking operation that is why submit
@@ -460,21 +473,13 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
460473
// handler rather than by-passing the scheduler.
461474
if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
462475
ExpandedDepEvents, MContext)) {
463-
if ((MDiscardEvents || !CallerNeedsEvent) &&
464-
supportsDiscardingPiEvents()) {
476+
if (!CallerNeedsEvent && supportsDiscardingPiEvents() &&
477+
!shouldRecordLastEvent()) {
465478
NestedCallsTracker tracker;
466479
MemOpFunc(MemOpArgs..., getUrEvents(ExpandedDepEvents),
467480
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
468481

469-
event DiscardedEvent = createDiscardedEvent();
470-
if (isInOrder()) {
471-
// Store the discarded event for proper in-order dependency tracking.
472-
auto &EventToStoreIn = MGraph.expired()
473-
? MDefaultGraphDeps.LastEventPtr
474-
: MExtGraphDeps.LastEventPtr;
475-
EventToStoreIn = detail::getSyclObjImpl(DiscardedEvent);
476-
}
477-
return DiscardedEvent;
482+
return createDiscardedEvent();
478483
}
479484

480485
event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
@@ -500,7 +505,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
500505
}
501506
}
502507

503-
if (isInOrder()) {
508+
if (shouldRecordLastEvent()) {
504509
auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
505510
: MExtGraphDeps.LastEventPtr;
506511
EventToStoreIn = EventImpl;
@@ -745,14 +750,11 @@ bool queue_impl::ext_oneapi_empty() const {
745750
// the status of the last event.
746751
if (isInOrder() && !MDiscardEvents) {
747752
std::lock_guard<std::mutex> Lock(MMutex);
748-
// If there is no last event we know that no work has been submitted, so it
749-
// must be trivially empty.
750-
if (!MDefaultGraphDeps.LastEventPtr)
751-
return true;
752-
// Otherwise, check if the last event is finished.
753+
assert((MDefaultGraphDeps.LastEventPtr != nullptr) == MHostTaskMode);
753754
// Note that we fall back to the backend query if the event was discarded,
754755
// which may happend despite the queue not being a discard event queue.
755-
if (!MDefaultGraphDeps.LastEventPtr->isDiscarded())
756+
if (MDefaultGraphDeps.LastEventPtr &&
757+
!MDefaultGraphDeps.LastEventPtr->isDiscarded())
756758
return MDefaultGraphDeps.LastEventPtr
757759
->get_info<info::event::command_execution_status>() ==
758760
info::event_command_status::complete;
@@ -765,6 +767,11 @@ bool queue_impl::ext_oneapi_empty() const {
765767
if (!IsReady)
766768
return false;
767769

770+
// If got here, it means that LastEventPtr is nullptr (so no possible Host
771+
// Tasks) and there is nothing executing on the device.
772+
if (isInOrder())
773+
return true;
774+
768775
// We may have events like host tasks which are not submitted to the backend
769776
// queue so we need to get their status separately.
770777
std::lock_guard<std::mutex> Lock(MMutex);

sycl/source/detail/queue_impl.hpp

+54-34
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ class queue_impl {
180180
#endif
181181
}
182182

183-
sycl::detail::optional<event> getLastEvent();
183+
sycl::detail::optional<event>
184+
getLastEvent(const std::shared_ptr<queue_impl> &Self);
184185

185186
private:
186187
void queue_impl_interop(ur_queue_handle_t UrQueue) {
@@ -720,40 +721,35 @@ class queue_impl {
720721
}
721722

722723
template <typename HandlerType = handler>
723-
event finalizeHandlerInOrder(HandlerType &Handler) {
724-
// Accessing and changing of an event isn't atomic operation.
725-
// Hence, here is the lock for thread-safety.
726-
std::lock_guard<std::mutex> Lock{MMutex};
724+
event finalizeHandlerInOrder(HandlerType &Handler,
725+
std::unique_lock<std::mutex> &Lock) {
726+
Lock.lock();
727727

728728
auto &EventToBuildDeps = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr
729729
: MExtGraphDeps.LastEventPtr;
730730

731-
// This dependency is needed for the following purposes:
732-
// - host tasks are handled by the runtime and cannot be implicitly
733-
// synchronized by the backend.
734-
// - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
735-
// by a host task. This dependency allows to build the enqueue order in
736-
// the RT but will not be passed to the backend. See getPIEvents in
737-
// Command.
738-
if (EventToBuildDeps) {
739-
// In the case where the last event was discarded and we are to run a
740-
// host_task, we insert a barrier into the queue and use the resulting
741-
// event as the dependency for the host_task.
742-
// Note that host_task events can never be discarded, so this will not
743-
// insert barriers between host_task enqueues.
744-
if (EventToBuildDeps->isDiscarded() &&
745-
Handler.getType() == CGType::CodeplayHostTask)
746-
EventToBuildDeps = insertHelperBarrier(Handler);
731+
if (Handler.getType() == CGType::CodeplayHostTask) {
732+
if (!MHostTaskMode && MGraph.expired() && !MEmpty) {
733+
assert(EventToBuildDeps == nullptr);
734+
// since we don't store any events, insert a barrier to ensure proper
735+
// ordering with device execution
736+
auto barrierEvent = insertHelperBarrier(Handler);
737+
Handler.depends_on(barrierEvent);
738+
}
739+
740+
MHostTaskMode = true;
741+
}
747742

743+
if (EventToBuildDeps && Handler.getType() != CGType::AsyncAlloc) {
748744
// depends_on after an async alloc is explicitly disallowed. Async alloc
749745
// handles in order queue dependencies preemptively, so we skip them.
750746
// Note: This could be improved by moving the handling of dependencies
751747
// to before calling the CGF.
752-
if (!EventToBuildDeps->isDiscarded() &&
753-
!(Handler.getType() == CGType::AsyncAlloc))
754-
Handler.depends_on(EventToBuildDeps);
748+
Handler.depends_on(EventToBuildDeps);
755749
}
756750

751+
MEmpty = false;
752+
757753
// If there is an external event set, add it as a dependency and clear it.
758754
// We do not need to hold the lock as MLastEventMtx will ensure the last
759755
// event reflects the corresponding external event dependence as well.
@@ -762,15 +758,22 @@ class queue_impl {
762758
Handler.depends_on(*ExternalEvent);
763759

764760
auto EventRet = Handler.finalize();
765-
EventToBuildDeps = getSyclObjImpl(EventRet);
761+
762+
if (shouldRecordLastEvent()) {
763+
EventToBuildDeps = getSyclObjImpl(EventRet);
764+
}
766765

767766
return EventRet;
768767
}
769768

770769
template <typename HandlerType = handler>
771-
event finalizeHandlerOutOfOrder(HandlerType &Handler) {
770+
event finalizeHandlerOutOfOrder(HandlerType &Handler,
771+
std::unique_lock<std::mutex> &Lock) {
772772
const CGType Type = getSyclObjImpl(Handler)->MCGType;
773-
std::lock_guard<std::mutex> Lock{MMutex};
773+
Lock.lock();
774+
775+
MEmpty = false;
776+
774777
// The following code supports barrier synchronization if host task is
775778
// involved in the scenario. Native barriers cannot handle host task
776779
// dependency so in the case where some commands were not enqueued
@@ -807,7 +810,8 @@ class queue_impl {
807810
template <typename HandlerType = handler>
808811
event finalizeHandlerPostProcess(
809812
HandlerType &Handler,
810-
const optional<SubmitPostProcessF> &PostProcessorFunc) {
813+
const optional<SubmitPostProcessF> &PostProcessorFunc,
814+
std::unique_lock<std::mutex> &Lock) {
811815
bool IsKernel = Handler.getType() == CGType::Kernel;
812816
bool KernelUsesAssert = false;
813817

@@ -818,8 +822,8 @@ class queue_impl {
818822
ProgramManager::getInstance().kernelUsesAssert(
819823
Handler.MKernelName.data());
820824

821-
auto Event = MIsInorder ? finalizeHandlerInOrder(Handler)
822-
: finalizeHandlerOutOfOrder(Handler);
825+
auto Event = MIsInorder ? finalizeHandlerInOrder(Handler, Lock)
826+
: finalizeHandlerOutOfOrder(Handler, Lock);
823827

824828
auto &PostProcess = *PostProcessorFunc;
825829

@@ -831,12 +835,13 @@ class queue_impl {
831835
// template is needed for proper unit testing
832836
template <typename HandlerType = handler>
833837
event finalizeHandler(HandlerType &Handler,
834-
const optional<SubmitPostProcessF> &PostProcessorFunc) {
838+
const optional<SubmitPostProcessF> &PostProcessorFunc,
839+
std::unique_lock<std::mutex> &Lock) {
835840
if (PostProcessorFunc) {
836-
return finalizeHandlerPostProcess(Handler, PostProcessorFunc);
841+
return finalizeHandlerPostProcess(Handler, PostProcessorFunc, Lock);
837842
} else {
838-
return MIsInorder ? finalizeHandlerInOrder(Handler)
839-
: finalizeHandlerOutOfOrder(Handler);
843+
return MIsInorder ? finalizeHandlerInOrder(Handler, Lock)
844+
: finalizeHandlerOutOfOrder(Handler, Lock);
840845
}
841846
}
842847

@@ -1006,6 +1011,21 @@ class queue_impl {
10061011

10071012
const bool MIsInorder;
10081013

1014+
// Specifies whether this queue uses host tasks. If yes, then event
1015+
// from all operations need to be recorded for proper synchronization.
1016+
bool MHostTaskMode = false;
1017+
1018+
bool shouldRecordLastEvent() const {
1019+
// For in-order queues we rely on UR queue ordering.
1020+
// We only need to keep the event if host task are used
1021+
// (to ensure proper ordering).
1022+
1023+
// TODO: do not record last event for graphs as well
1024+
return MIsInorder && (MHostTaskMode || !MGraph.expired());
1025+
}
1026+
1027+
bool MEmpty = true;
1028+
10091029
std::vector<EventImplPtr> MStreamsServiceEvents;
10101030
std::mutex MStreamsServiceEventsMutex;
10111031

sycl/source/queue.cpp

+4-23
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) {
302302
assert(!QueueImpl->hasCommandGraph() &&
303303
"Should not be called in on graph recording.");
304304

305-
sycl::detail::optional<event> LastEvent = QueueImpl->getLastEvent();
305+
sycl::detail::optional<event> LastEvent = QueueImpl->getLastEvent(QueueImpl);
306306
if (LastEvent)
307307
return *LastEvent;
308308

@@ -321,10 +321,7 @@ getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) {
321321
event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) {
322322
if (is_in_order() && !impl->hasCommandGraph() && !impl->MDiscardEvents &&
323323
!impl->MIsProfilingEnabled) {
324-
event InOrderLastEvent = getBarrierEventForInorderQueueHelper(impl);
325-
// If the last event was discarded, fall back to enqueuing a barrier.
326-
if (!detail::getSyclObjImpl(InOrderLastEvent)->isDiscarded())
327-
return InOrderLastEvent;
324+
return getBarrierEventForInorderQueueHelper(impl);
328325
}
329326

330327
return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc);
@@ -349,10 +346,7 @@ event queue::ext_oneapi_submit_barrier(const std::vector<event> &WaitList,
349346
});
350347
if (is_in_order() && !impl->hasCommandGraph() && !impl->MDiscardEvents &&
351348
!impl->MIsProfilingEnabled && AllEventsEmptyOrNop) {
352-
event InOrderLastEvent = getBarrierEventForInorderQueueHelper(impl);
353-
// If the last event was discarded, fall back to enqueuing a barrier.
354-
if (!detail::getSyclObjImpl(InOrderLastEvent)->isDiscarded())
355-
return InOrderLastEvent;
349+
return getBarrierEventForInorderQueueHelper(impl);
356350
}
357351

358352
return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); },
@@ -432,20 +426,7 @@ sycl::detail::optional<event> queue::ext_oneapi_get_last_event_impl() const {
432426
make_error_code(errc::invalid),
433427
"ext_oneapi_get_last_event() can only be called on in-order queues.");
434428

435-
sycl::detail::optional<event> LastEvent = impl->getLastEvent();
436-
437-
// If there was no last event, the queue is yet to have any work submitted and
438-
// we return a std::nullopt.
439-
if (!LastEvent)
440-
return std::nullopt;
441-
442-
// If the last event was discarded or a NOP, we insert a marker to represent
443-
// an event at end.
444-
auto LastEventImpl = detail::getSyclObjImpl(*LastEvent);
445-
if (LastEventImpl->isDiscarded() || LastEventImpl->isNOP())
446-
LastEvent =
447-
detail::createSyclObjFromImpl<event>(impl->insertMarkerEvent(impl));
448-
return LastEvent;
429+
return impl->getLastEvent(impl);
449430
}
450431

451432
void queue::ext_oneapi_set_external_event(const event &external_event) {

sycl/test-e2e/Graph/lit.local.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
# https://github.com/intel/llvm/issues/17165
22
if 'windows' in config.available_features:
33
config.unsupported_features += ['arch-intel_gpu_bmg_g21']
4+
config.unsupported_features += ['level_zero_v2_adapter']

sycl/test-e2e/InOrderEventsExt/get_last_event.cpp

+7-2
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,13 @@ int Check(const sycl::queue &Q, const char *CheckName, const F &CheckFunc) {
3636
<< std::endl;
3737
return 1;
3838
}
39-
if (*E != *LastEvent) {
40-
std::cout << "Failed " << CheckName << std::endl;
39+
if (LastEvent->get_info<sycl::info::event::command_execution_status>() ==
40+
sycl::info::event_command_status::complete &&
41+
E->get_info<sycl::info::event::command_execution_status>() !=
42+
sycl::info::event_command_status::complete) {
43+
std::cout << "ext_oneapi_get_last_event() returned an event that is "
44+
"complete, but the event returned by CheckFunc() is not."
45+
<< std::endl;
4146
return 1;
4247
}
4348
return 0;

sycl/test-e2e/InorderQueue/in_order_ext_oneapi_submit_barrier.cpp

+1-9
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
11
// RUN: %{build} -o %t.out
2-
// RUN: env SYCL_UR_TRACE=2 %{run} %t.out 2>&1 | FileCheck %s
3-
4-
// Test to check that we don't insert unnecessary urEnqueueEventsWaitWithBarrier
5-
// calls if queue is in-order and wait list is empty.
6-
7-
// CHECK-NOT: <--- urEnqueueEventsWaitWithBarrier
2+
// RUN: %{run} %t.out
83

94
#include <condition_variable>
105

@@ -26,13 +21,11 @@ int main() {
2621
cgh.single_task<class kernel1>([=]() { *Res += 9; });
2722
});
2823
auto BarrierEvent1 = Q.ext_oneapi_submit_barrier();
29-
assert(BarrierEvent1 == Event1);
3024
auto Event2 = Q.submit([&](sycl::handler &cgh) {
3125
cgh.single_task<class kernel2>([=]() { *Res *= 2; });
3226
});
3327

3428
auto BarrierEvent2 = Q.ext_oneapi_submit_barrier();
35-
assert(BarrierEvent2 == Event2);
3629
BarrierEvent2.wait();
3730

3831
// Check that kernel events are completed after waiting for barrier event.
@@ -50,7 +43,6 @@ int main() {
5043
auto Event1 = Q.submit(
5144
[&](sycl::handler &CGH) { CGH.host_task([&] { *Res += 1; }); });
5245
auto BarrierEvent1 = Q.ext_oneapi_submit_barrier();
53-
assert(Event1 == BarrierEvent1);
5446
auto Event2 = Q.submit([&](sycl::handler &CGH) { CGH.fill(Res, 10, 1); });
5547

5648
Q.wait();

0 commit comments

Comments
 (0)