@@ -180,7 +180,8 @@ class queue_impl {
180
180
#endif
181
181
}
182
182
183
// Diff context: the declaration changes from a zero-argument form to one that
// takes a shared_ptr to this queue_impl (Self). The return type is unchanged:
// an optional event.
// NOTE(review): presumably yields the most recently recorded event for the
// queue and is empty when none is tracked - confirm against the out-of-line
// definition, which is not part of this hunk.
- sycl::detail::optional<event> getLastEvent ();
183
+ sycl::detail::optional<event>
184
+ getLastEvent (const std::shared_ptr<queue_impl> &Self);
184
185
185
186
private:
186
187
void queue_impl_interop (ur_queue_handle_t UrQueue) {
@@ -720,40 +721,35 @@ class queue_impl {
720
721
}
721
722
722
723
template <typename HandlerType = handler>
723
- event finalizeHandlerInOrder (HandlerType &Handler) {
724
- // Accessing and changing of an event isn't atomic operation.
725
- // Hence, here is the lock for thread-safety.
726
- std::lock_guard<std::mutex> Lock{MMutex};
724
+ event finalizeHandlerInOrder (HandlerType &Handler,
725
+ std::unique_lock<std::mutex> &Lock) {
726
+ Lock.lock ();
727
727
728
728
auto &EventToBuildDeps = MGraph.expired () ? MDefaultGraphDeps.LastEventPtr
729
729
: MExtGraphDeps.LastEventPtr ;
730
730
731
- // This dependency is needed for the following purposes:
732
- // - host tasks are handled by the runtime and cannot be implicitly
733
- // synchronized by the backend.
734
- // - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
735
- // by a host task. This dependency allows to build the enqueue order in
736
- // the RT but will not be passed to the backend. See getPIEvents in
737
- // Command.
738
- if (EventToBuildDeps) {
739
- // In the case where the last event was discarded and we are to run a
740
- // host_task, we insert a barrier into the queue and use the resulting
741
- // event as the dependency for the host_task.
742
- // Note that host_task events can never be discarded, so this will not
743
- // insert barriers between host_task enqueues.
744
- if (EventToBuildDeps->isDiscarded () &&
745
- Handler.getType () == CGType::CodeplayHostTask)
746
- EventToBuildDeps = insertHelperBarrier (Handler);
731
+ if (Handler.getType () == CGType::CodeplayHostTask) {
732
+ if (!MHostTaskMode && MGraph.expired () && !MEmpty) {
733
+ assert (EventToBuildDeps == nullptr );
734
+ // since we don't store any events, insert a barrier to ensure proper
735
+ // ordering with device execution
736
+ auto barrierEvent = insertHelperBarrier (Handler);
737
+ Handler.depends_on (barrierEvent);
738
+ }
739
+
740
+ MHostTaskMode = true ;
741
+ }
747
742
743
+ if (EventToBuildDeps && Handler.getType () != CGType::AsyncAlloc) {
748
744
// depends_on after an async alloc is explicitly disallowed. Async alloc
749
745
// handles in order queue dependencies preemptively, so we skip them.
750
746
// Note: This could be improved by moving the handling of dependencies
751
747
// to before calling the CGF.
752
- if (!EventToBuildDeps->isDiscarded () &&
753
- !(Handler.getType () == CGType::AsyncAlloc))
754
- Handler.depends_on (EventToBuildDeps);
748
+ Handler.depends_on (EventToBuildDeps);
755
749
}
756
750
751
+ MEmpty = false ;
752
+
757
753
// If there is an external event set, add it as a dependency and clear it.
758
754
// We do not need to hold the lock as MLastEventMtx will ensure the last
759
755
// event reflects the corresponding external event dependence as well.
@@ -762,15 +758,22 @@ class queue_impl {
762
758
Handler.depends_on (*ExternalEvent);
763
759
764
760
auto EventRet = Handler.finalize ();
765
- EventToBuildDeps = getSyclObjImpl (EventRet);
761
+
762
+ if (shouldRecordLastEvent ()) {
763
+ EventToBuildDeps = getSyclObjImpl (EventRet);
764
+ }
766
765
767
766
return EventRet;
768
767
}
769
768
770
769
template <typename HandlerType = handler>
771
- event finalizeHandlerOutOfOrder (HandlerType &Handler) {
770
+ event finalizeHandlerOutOfOrder (HandlerType &Handler,
771
+ std::unique_lock<std::mutex> &Lock) {
772
772
const CGType Type = getSyclObjImpl (Handler)->MCGType ;
773
- std::lock_guard<std::mutex> Lock{MMutex};
773
+ Lock.lock ();
774
+
775
+ MEmpty = false ;
776
+
774
777
// The following code supports barrier synchronization if host task is
775
778
// involved in the scenario. Native barriers cannot handle host task
776
779
// dependency so in the case where some commands were not enqueued
@@ -807,7 +810,8 @@ class queue_impl {
807
810
template <typename HandlerType = handler>
808
811
event finalizeHandlerPostProcess (
809
812
HandlerType &Handler,
810
- const optional<SubmitPostProcessF> &PostProcessorFunc) {
813
+ const optional<SubmitPostProcessF> &PostProcessorFunc,
814
+ std::unique_lock<std::mutex> &Lock) {
811
815
bool IsKernel = Handler.getType () == CGType::Kernel;
812
816
bool KernelUsesAssert = false ;
813
817
@@ -818,8 +822,8 @@ class queue_impl {
818
822
ProgramManager::getInstance ().kernelUsesAssert (
819
823
Handler.MKernelName .data ());
820
824
821
- auto Event = MIsInorder ? finalizeHandlerInOrder (Handler)
822
- : finalizeHandlerOutOfOrder (Handler);
825
+ auto Event = MIsInorder ? finalizeHandlerInOrder (Handler, Lock )
826
+ : finalizeHandlerOutOfOrder (Handler, Lock );
823
827
824
828
auto &PostProcess = *PostProcessorFunc;
825
829
@@ -831,12 +835,13 @@ class queue_impl {
831
835
// Entry point that finalizes a handler and returns the resulting event.
//  - When PostProcessorFunc holds a value, the work is delegated to
//    finalizeHandlerPostProcess.
//  - Otherwise MIsInorder selects finalizeHandlerInOrder or
//    finalizeHandlerOutOfOrder.
// Lock is only forwarded here. finalizeHandlerInOrder and
// finalizeHandlerOutOfOrder call Lock.lock() themselves, so the lock must not
// already own its mutex when this function is entered (std::unique_lock::lock
// throws std::system_error if the mutex is already owned).
// NOTE(review): presumably Lock is a deferred unique_lock over MMutex created
// by the caller - confirm at the call site.
// The HandlerType template parameter exists so unit tests can substitute the
// handler type.
832
836
template <typename HandlerType = handler>
833
837
event finalizeHandler (HandlerType &Handler,
834
- const optional<SubmitPostProcessF> &PostProcessorFunc) {
838
+ const optional<SubmitPostProcessF> &PostProcessorFunc,
839
+ std::unique_lock<std::mutex> &Lock) {
835
840
if (PostProcessorFunc) {
836
- return finalizeHandlerPostProcess (Handler, PostProcessorFunc);
841
+ return finalizeHandlerPostProcess (Handler, PostProcessorFunc, Lock );
837
842
} else {
838
- return MIsInorder ? finalizeHandlerInOrder (Handler)
839
- : finalizeHandlerOutOfOrder (Handler);
843
+ return MIsInorder ? finalizeHandlerInOrder (Handler, Lock )
844
+ : finalizeHandlerOutOfOrder (Handler, Lock );
840
845
}
841
846
}
842
847
@@ -1006,6 +1011,21 @@ class queue_impl {
1006
1011
1007
1012
const bool MIsInorder;
1008
1013
1014
+ // Specifies whether this queue uses host tasks. If yes, then events
1015
+ // from all operations need to be recorded for proper synchronization.
// Starts out false. finalizeHandlerInOrder sets it to true the first time a
// CGType::CodeplayHostTask handler is finalized; nothing in the visible code
// resets it afterwards.
1016
+ bool MHostTaskMode = false ;
1017
+
1018
// Tells finalizeHandlerInOrder whether the event returned by
// Handler.finalize() should be stored as the new last event.
// Returns true only when the queue is in-order (MIsInorder) and either
// MHostTaskMode is set or MGraph has not expired.
+ bool shouldRecordLastEvent () const {
1019
+ // For in-order queues we rely on UR queue ordering.
1020
+ // We only need to keep the event if host tasks are used
1021
+ // (to ensure proper ordering).
1022
+
1023
+ // TODO: do not record last event for graphs as well
1024
+ return MIsInorder && (MHostTaskMode || !MGraph.expired ());
1025
+ }
1026
+
1027
// True until the first handler is finalized: both finalizeHandlerInOrder and
// finalizeHandlerOutOfOrder clear it after taking the lock. While it is still
// true, finalizeHandlerInOrder skips the helper barrier for the first host
// task.
// NOTE(review): other code outside these hunks may also read or write this
// flag - confirm before relying on it.
+ bool MEmpty = true ;
1028
+
1009
1029
std::vector<EventImplPtr> MStreamsServiceEvents;
1010
1030
std::mutex MStreamsServiceEventsMutex;
1011
1031
0 commit comments