From b5fd0077859183607be0e67910eff1a8b5ca9547 Mon Sep 17 00:00:00 2001
From: Stuart Adams <stuart.adams@codeplay.com>
Date: Thu, 12 Dec 2019 14:49:51 +0000
Subject: [PATCH 1/2] [PI][CUDA] Implementation of piEventSetCallback with
 tests

Signed-off-by: Stuart Adams <stuart.adams@codeplay.com>
Signed-off-by: Steffen Larsen <steffen.larsen@codeplay.com>
Signed-off-by: Ruyman Reyes <ruyman.reyes@codeplay.com>
---
 sycl/plugins/cuda/pi_cuda.cpp                 | 92 +++++++++---------
 sycl/plugins/cuda/pi_cuda.hpp                 | 94 +++++++++++++++++--
 .../basic_tests/buffer/buffer_dev_to_dev.cpp  |  3 -
 sycl/test/scheduler/DataMovement.cpp          |  1 -
 sycl/test/scheduler/MultipleDevices.cpp       |  3 -
 sycl/unittests/pi/EventTest.cpp               | 82 ++++++++++++++++
 6 files changed, 218 insertions(+), 57 deletions(-)

diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 52d87d6818756..1f174bcf80049 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -149,6 +149,8 @@ pi_result _pi_event::start() {
   }
 
   isStarted_ = true;
+  // let observers know that the event is "submitted"
+  trigger_callback(get_execution_status());
   return result;
 }
 
@@ -195,6 +197,22 @@ pi_result _pi_event::record() {
 
     try {
       result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));
+
+      result = cuda_piEventRetain(this);
+      try {
+        result = PI_CHECK_ERROR(cuLaunchHostFunc(
+            cuStream,
+            [](void *userData) {
+              pi_event event = reinterpret_cast<pi_event>(userData);
+              event->set_event_complete();
+              cuda_piEventRelease(event);
+            },
+            this));
+      } catch (...) {
+        // If host function fails to enqueue we must release the event here
+        result = cuda_piEventRelease(this);
+        throw;
+      }
     } catch (pi_result error) {
       result = error;
     }
@@ -215,6 +233,7 @@ pi_result _pi_event::wait() {
   if (is_native_event()) {
     try {
       retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
+      isCompleted_ = true;
     } catch (pi_result error) {
       retErr = error;
     }
@@ -226,30 +245,12 @@ pi_result _pi_event::wait() {
     retErr = PI_SUCCESS;
   }
 
-  return retErr;
-}
-
-pi_event_status _pi_event::get_execution_status() const noexcept {
+  auto is_success = retErr == PI_SUCCESS;
+  auto status = is_success ? get_execution_status() : pi_int32(retErr);
 
-  if (!is_recorded()) {
-    return PI_EVENT_SUBMITTED;
-  }
-
-  if (is_native_event()) {
-    // native event status
-
-    auto status = cuEventQuery(get());
-    if (status == CUDA_ERROR_NOT_READY) {
-      return PI_EVENT_RUNNING;
-    } else if (status != CUDA_SUCCESS) {
-      cl::sycl::detail::pi::die("Invalid CUDA event status");
-    }
-    return PI_EVENT_COMPLETE;
-  } else {
-    // user event status
+  trigger_callback(status);
 
-    return is_completed() ? PI_EVENT_COMPLETE : PI_EVENT_RUNNING;
-  }
+  return retErr;
 }
 
 // iterates over the event wait list, returns correct pi_result error codes.
@@ -2516,24 +2517,21 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,
 
   switch (param_name) {
   case PI_EVENT_INFO_COMMAND_QUEUE:
-    return getInfo<pi_queue>(param_value_size, param_value,
-                             param_value_size_ret, event->get_queue());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_queue());
   case PI_EVENT_INFO_COMMAND_TYPE:
-    return getInfo<pi_command_type>(param_value_size, param_value,
-                                    param_value_size_ret,
-                                    event->get_command_type());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_command_type());
   case PI_EVENT_INFO_REFERENCE_COUNT:
-    return getInfo<pi_uint32>(param_value_size, param_value,
-                              param_value_size_ret,
-                              event->get_reference_count());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_reference_count());
   case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
-    return getInfo<pi_event_status>(param_value_size, param_value,
-                                    param_value_size_ret,
-                                    event->get_execution_status());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   static_cast<pi_event_status>(event->get_execution_status()));
   }
   case PI_EVENT_INFO_CONTEXT:
-    return getInfo<pi_context>(param_value_size, param_value,
-                               param_value_size_ret, event->get_context());
+    return getInfo(param_value_size, param_value, param_value_size_ret,
+                   event->get_context());
   default:
     PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
   }
@@ -2568,13 +2566,21 @@ pi_result cuda_piEventGetProfilingInfo(
   return {};
 }
 
-pi_result cuda_piEventSetCallback(
-    pi_event event, pi_int32 command_exec_callback_type,
-    void (*pfn_notify)(pi_event event, pi_int32 event_command_status,
-                       void *user_data),
-    void *user_data) {
-  cl::sycl::detail::pi::die("cuda_piEventSetCallback not implemented");
-  return {};
+pi_result cuda_piEventSetCallback(pi_event event,
+                                  pi_int32 command_exec_callback_type,
+                                  pfn_notify notify, void *user_data) {
+
+  assert(event);
+  assert(notify);
+  assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
+         command_exec_callback_type == PI_EVENT_RUNNING ||
+         command_exec_callback_type == PI_EVENT_COMPLETE);
+  event_callback callback(pi_event_status(command_exec_callback_type), notify,
+                          user_data);
+
+  event->set_event_callback(callback);
+
+  return PI_SUCCESS;
 }
 
 pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
@@ -2587,7 +2593,7 @@ pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
   }
 
   if (execution_status == PI_EVENT_COMPLETE) {
-    return event->set_user_event_complete();
+    return event->set_event_complete();
   } else if (execution_status < 0) {
     // TODO: A negative integer value causes all enqueued commands that wait
     // on this user event to be terminated.
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index fe0fda38bd3c7..ba70aa6aaee95 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -235,6 +235,39 @@ struct _pi_queue {
   pi_uint32 get_reference_count() const noexcept { return refCount_; }
 };
 
+typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
+                           void *userData);
+
+class event_callback {
+public:
+  void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {
+
+    auto validParameters = callback_ && event;
+
+    // As a pi_event_status value approaches 0, it gets closer to completion.
+    // If the calling pi_event's status is less than or equal to the event
+    // status the user is interested in, invoke the callback anyway. The event
+    // will have passed through that state anyway.
+    auto validStatus = currentEventStatus <= observedEventStatus_;
+
+    if (validParameters && validStatus) {
+
+      callback_(event, currentEventStatus, userData_);
+    }
+  }
+
+  event_callback(pi_event_status status, pfn_notify callback, void *userData)
+      : observedEventStatus_{status}, callback_{callback}, userData_{userData} {
+  }
+
+  pi_event_status get_status() const noexcept { return observedEventStatus_; }
+
+private:
+  pi_event_status observedEventStatus_;
+  pfn_notify callback_;
+  void *userData_;
+};
+
 class _pi_event {
 public:
   using native_type = CUevent;
@@ -247,18 +280,39 @@ class _pi_event {
 
   native_type get() const noexcept { return evEnd_; };
 
-  pi_result set_user_event_complete() noexcept {
+  pi_result set_event_complete() noexcept {
 
     if (isCompleted_) {
       return PI_INVALID_OPERATION;
     }
 
-    if (is_user_event()) {
-      isRecorded_ = true;
-      isCompleted_ = true;
-      return PI_SUCCESS;
+    isRecorded_ = true;
+    isCompleted_ = true;
+
+    trigger_callback(get_execution_status());
+
+    return PI_SUCCESS;
+  }
+
+  void trigger_callback(pi_int32 status) {
+
+    std::vector<event_callback> callbacks;
+
+    // Here we move all callbacks into local variable before we call them.
+    // This is a defensive maneuver; if any of the callbacks attempt to
+    // add additional callbacks, we will end up in a bad spot. Our mutex
+    // will be locked twice and the vector will be modified as it is being
+    // iterated over! By moving everything locally, we can call all of these
+    // callbacks and let them modify the original vector without much worry.
+
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      event_callbacks_.swap(callbacks);
+    }
+
+    for (auto &event_callback : callbacks) {
+      event_callback.trigger_callback(this, status);
     }
-    return PI_INVALID_EVENT;
   }
 
   pi_queue get_queue() const noexcept { return queue_; }
@@ -273,7 +327,27 @@ class _pi_event {
 
   bool is_started() const noexcept { return isStarted_; }
 
-  pi_event_status get_execution_status() const noexcept;
+  pi_int32 get_execution_status() const noexcept {
+
+    if (!is_recorded()) {
+      return PI_EVENT_SUBMITTED;
+    }
+
+    if (!is_completed()) {
+      return PI_EVENT_RUNNING;
+    }
+    return PI_EVENT_COMPLETE;
+  }
+
+  void set_event_callback(const event_callback &callback) {
+    auto current_status = get_execution_status();
+    if (current_status <= callback.get_status()) {
+      callback.trigger_callback(this, current_status);
+    } else {
+      std::lock_guard<std::mutex> lock(mutex_);
+      event_callbacks_.emplace_back(callback);
+    }
+  }
 
   pi_context get_context() const noexcept { return context_; };
 
@@ -343,6 +417,12 @@ class _pi_event {
   pi_context context_; // pi_context associated with the event. If this is a
                        // native event, this will be the same context associated
                        // with the queue_ member.
+
+  std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
+                     // a lock-free data structure we can use here.
+  std::vector<event_callback>
+      event_callbacks_; // Callbacks that can be triggered when an event's state
+                        // changes.
 };
 
 struct _pi_program {
diff --git a/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp b/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
index f7caf5d070d2d..c84a06bbbace4 100644
--- a/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
+++ b/sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
@@ -4,9 +4,6 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
-// TODO: pi_die: cuda_piEventSetCallback not implemented
-// XFAIL: cuda
-
 //==---------- buffer_dev_to_dev.cpp - SYCL buffer basic test --------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test/scheduler/DataMovement.cpp b/sycl/test/scheduler/DataMovement.cpp
index 1de310571c824..f1812bf4dbf29 100644
--- a/sycl/test/scheduler/DataMovement.cpp
+++ b/sycl/test/scheduler/DataMovement.cpp
@@ -1,7 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
 // RUN: %t.out
 //
-// XFAIL: cuda
 //==-------------------------- DataMovement.cpp ----------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/test/scheduler/MultipleDevices.cpp b/sycl/test/scheduler/MultipleDevices.cpp
index d27923929871a..c031f3c8375c1 100644
--- a/sycl/test/scheduler/MultipleDevices.cpp
+++ b/sycl/test/scheduler/MultipleDevices.cpp
@@ -1,9 +1,6 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
 // RUN: %t.out
 
-// TODO: pi_die: cuda_piEventSetCallback not implemented
-// XFAIL: cuda
-
 //===- MultipleDevices.cpp - Test checking multi-device execution --------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
diff --git a/sycl/unittests/pi/EventTest.cpp b/sycl/unittests/pi/EventTest.cpp
index 4f48cc688a74b..d3c88b9db97a7 100644
--- a/sycl/unittests/pi/EventTest.cpp
+++ b/sycl/unittests/pi/EventTest.cpp
@@ -89,6 +89,88 @@ TEST_F(DISABLED_EventTest, PICreateEvent) {
             PI_SUCCESS);
 }
 
+constexpr size_t event_type_count = 3;
+static bool triggered_flag[event_type_count] = {false, false, false};
+
+struct callback_user_data {
+  pi_int32 event_type;
+  int index;
+};
+
+void EventCallback(pi_event event, pi_int32 status, void *data) {
+  ASSERT_NE(data, nullptr);
+
+  callback_user_data *pdata = static_cast<callback_user_data *>(data);
+
+#ifndef NDEBUG
+  printf("\tEvent callback %d of type %d triggered\n", pdata->index,
+         pdata->event_type);
+#endif
+
+  triggered_flag[pdata->index] = true;
+}
+
+TEST_F(DISABLED_EventTest, piEventSetCallback) {
+
+  pi_int32 event_callback_types[event_type_count] = {
+      PI_EVENT_SUBMITTED, PI_EVENT_RUNNING, PI_EVENT_COMPLETE};
+
+  callback_user_data user_data[event_type_count];
+
+  // gate event lets us register callbacks before letting the enqueued work be
+  // executed.
+  pi_event gateEvent;
+  ASSERT_EQ((Plugins[0].call_nocheck<detail::PiApiKind::piEventCreate>(
+                _context, &gateEvent)),
+            PI_SUCCESS);
+
+  constexpr const size_t dataCount = 1000u;
+  std::vector<int> data(dataCount);
+  auto size_in_bytes = data.size() * sizeof(int);
+
+  pi_mem memObj;
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piMemBufferCreate>(
+          _context, PI_MEM_FLAGS_ACCESS_RW, size_in_bytes, nullptr, &memObj)),
+      PI_SUCCESS);
+
+  pi_event syncEvent;
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEnqueueMemBufferWrite>(
+          _queue, memObj, false, 0, size_in_bytes, data.data(), 1, &gateEvent,
+          &syncEvent)),
+      PI_SUCCESS);
+
+  for (size_t i = 0; i < event_type_count; i++) {
+    user_data[i].event_type = event_callback_types[i];
+    user_data[i].index = i;
+    ASSERT_EQ(
+        (Plugins[0].call_nocheck<detail::PiApiKind::piEventSetCallback>(
+            syncEvent, event_callback_types[i], EventCallback, user_data + i)),
+        PI_SUCCESS);
+  }
+
+  ASSERT_EQ((Plugins[0].call_nocheck<detail::PiApiKind::piEventSetStatus>(
+                gateEvent, PI_EVENT_COMPLETE)),
+            PI_SUCCESS);
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEventsWait>(1, &syncEvent)),
+      PI_SUCCESS);
+  ASSERT_EQ((Plugins[0].call_nocheck<detail::PiApiKind::piQueueFinish>(_queue)),
+            PI_SUCCESS);
+
+  for (size_t k = 0; k < event_type_count; ++k) {
+    EXPECT_TRUE(triggered_flag[k]);
+  }
+
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEventRelease>(gateEvent)),
+      PI_SUCCESS);
+  ASSERT_EQ(
+      (Plugins[0].call_nocheck<detail::PiApiKind::piEventRelease>(syncEvent)),
+      PI_SUCCESS);
+}
+
 TEST_F(DISABLED_EventTest, piEventGetInfo) {
 
   pi_event foo;

From cd9b90ec674b14e47c6e53fe4f152cee5a0dea01 Mon Sep 17 00:00:00 2001
From: Ruyman Reyes <ruyman@codeplay.com>
Date: Thu, 5 Mar 2020 14:30:24 +0000
Subject: [PATCH 2/2] [SYCL] GlueEvent uses now the correct plugins

The SYCL RT code for GlueEvent calls now
the right plugin to create the event that triggers the
dependency chain.
Renamed variables to clarify the source code and avoid
confusions between Context and Plugin

Signed-off-by: Ruyman Reyes <ruyman@codeplay.com>
---
 sycl/source/detail/scheduler/commands.cpp | 33 +++++++++++++----------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 965285ce75ddc..a60de37570689 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -161,7 +161,7 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) {
   EventImplPtr *Event = (reinterpret_cast<EventImplPtr *>(data));
   RT::PiEvent &EventHandle = (*Event)->getHandleRef();
   const detail::plugin &Plugin = (*Event)->getPlugin();
-  Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, CL_COMPLETE);
+  Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, PI_EVENT_COMPLETE);
   delete (Event);
 }
 
@@ -169,37 +169,42 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) {
 std::vector<EventImplPtr> Command::prepareEvents(ContextImplPtr Context) {
   std::vector<EventImplPtr> Result;
   std::vector<EventImplPtr> GlueEvents;
-  for (EventImplPtr &Event : MDepsEvents) {
+  for (EventImplPtr &DepEvent : MDepsEvents) {
     // Async work is not supported for host device.
-    if (Event->is_host()) {
-      Event->waitInternal();
+    if (DepEvent->is_host()) {
+      DepEvent->waitInternal();
       continue;
     }
     // The event handle can be null in case of, for example, alloca command,
     // which is currently synchrounious, so don't generate OpenCL event.
-    if (Event->getHandleRef() == nullptr) {
+    if (DepEvent->getHandleRef() == nullptr) {
       continue;
     }
-    ContextImplPtr EventContext = Event->getContextImpl();
-    const detail::plugin &Plugin = Event->getPlugin();
-    // If contexts don't match - connect them using user event
-    if (EventContext != Context && !Context->is_host()) {
+    ContextImplPtr DepEventContext = DepEvent->getContextImpl();
 
+    // If contexts don't match - connect them using user event
+    if (DepEventContext != Context && !Context->is_host()) {
       EventImplPtr GlueEvent(new detail::event_impl());
       GlueEvent->setContextImpl(Context);
+      EventImplPtr *GlueEventCopy =
+          new EventImplPtr(GlueEvent); // To increase the reference count by 1.
+
       RT::PiEvent &GlueEventHandle = GlueEvent->getHandleRef();
+      auto Plugin = Context->getPlugin();
+      auto DepPlugin = DepEventContext->getPlugin();
+      // Add an event on the current context that
+      // is triggered when the DepEvent is complete
       Plugin.call<PiApiKind::piEventCreate>(Context->getHandleRef(),
                                             &GlueEventHandle);
-      EventImplPtr *GlueEventCopy =
-          new EventImplPtr(GlueEvent); // To increase the reference count by 1.
-      Plugin.call<PiApiKind::piEventSetCallback>(
-          Event->getHandleRef(), CL_COMPLETE, EventCompletionClbk,
+
+      DepPlugin.call<PiApiKind::piEventSetCallback>(
+          DepEvent->getHandleRef(), PI_EVENT_COMPLETE, EventCompletionClbk,
           /*void *data=*/(GlueEventCopy));
       GlueEvents.push_back(GlueEvent);
       Result.push_back(std::move(GlueEvent));
       continue;
     }
-    Result.push_back(Event);
+    Result.push_back(DepEvent);
   }
   MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end());
   return Result;