Add Cuda event and stream API #32460

Merged: 33 commits, Jul 19, 2021. Changes shown from 32 commits.

Commits
b0f82d7
add cuda event and stream api
MingMingShangTian Apr 14, 2021
1ae2410
add cuda event and stream api
MingMingShangTian Apr 21, 2021
15a8e37
add get_current_stream api
MingMingShangTian Apr 21, 2021
5cf8c0e
add get_current_stream api
MingMingShangTian Apr 21, 2021
a79fc01
init streams
MingMingShangTian Apr 22, 2021
ee61c4d
modify get_current_stream
MingMingShangTian Apr 23, 2021
c56b9e7
modify get_cuttent_stream
MingMingShangTian Apr 23, 2021
c3dc86f
add synchronize func
MingMingShangTian Apr 25, 2021
174d28b
merge develop branch
MingMingShangTian Apr 25, 2021
fe05664
add current_stream doc and test file
MingMingShangTian Apr 25, 2021
1cd30bf
move get_current_stream into CUDA macro
MingMingShangTian Apr 25, 2021
811daf5
move CudaEvent into CUDA macro
MingMingShangTian Apr 25, 2021
0ff122a
move _get_current_stream and _device_synchronize into cuda macro
MingMingShangTian Apr 25, 2021
c369baf
modify the macro of cuda stream and event
MingMingShangTian Apr 26, 2021
cd278ea
add test case for synchronize
MingMingShangTian Apr 26, 2021
90549fc
add paddle.devices.cuda module
MingMingShangTian Apr 26, 2021
67049e5
event and stream support hip
MingMingShangTian Apr 26, 2021
b128268
add doc for stream and event class
MingMingShangTian Apr 26, 2021
5a611e7
move cuda stream and event into single pybind
MingMingShangTian Apr 29, 2021
11df0ff
Merge branch 'develop' into cuda_event
MingMingShangTian Apr 29, 2021
e5a6de5
merge develop branch
MingMingShangTian Apr 29, 2021
ebeefed
add cuda_streams_py.cc to cmakelist
MingMingShangTian May 6, 2021
9a1940e
add _device_synchronize and _get_current_stream to core module
MingMingShangTian May 7, 2021
a9ece68
add test case for cudastream and cudaevent
MingMingShangTian May 10, 2021
ac72134
move __all__ in streams.py
MingMingShangTian May 10, 2021
e83b17b
fix test fail
MingMingShangTian May 11, 2021
2a4ed96
Merge branch 'develop' into cuda_event
MingMingShangTian Jul 8, 2021
6fa6665
add cuda to devices __all__
MingMingShangTian Jul 8, 2021
41519d4
fix current_stream doc writing error
MingMingShangTian Jul 13, 2021
025813e
move devices to device direction, and merge device.py into __init__.py
MingMingShangTian Jul 15, 2021
3c933c6
add required:gpu to sample codes
MingMingShangTian Jul 15, 2021
caa7744
Merge branch 'develop' into cuda_event
MingMingShangTian Jul 15, 2021
b26be1c
remove cuda direction from device/__init__.py
MingMingShangTian Jul 19, 2021
94 changes: 94 additions & 0 deletions paddle/fluid/platform/event.h
@@ -22,6 +22,7 @@ limitations under the License. */
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"

namespace paddle {
namespace platform {
@@ -117,5 +118,98 @@ class MemEvent {
std::string annotation_;
};

class CudaEvent {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
public:
CudaEvent() {
#ifdef PADDLE_WITH_HIP
hipEventCreateWithFlags(&event_, flags_);
#else
cudaEventCreateWithFlags(&event_, flags_);
#endif
}

CudaEvent(unsigned int flags) : flags_(flags) {
#ifdef PADDLE_WITH_HIP
hipEventCreateWithFlags(&event_, flags_);
#else
cudaEventCreateWithFlags(&event_, flags_);
#endif
}

void Record(paddle::platform::stream::CUDAStream& stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, stream.raw_stream()));
#endif
}

bool Query() {
#ifdef PADDLE_WITH_HIP
gpuError_t err = hipEventQuery(event_);
if (err == hipSuccess) {
return true;
}
if (err == hipErrorNotReady) {
return false;
}
#else
gpuError_t err = cudaEventQuery(event_);
if (err == cudaSuccess) {
return true;
}
if (err == cudaErrorNotReady) {
return false;
}
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(err);
return false;
}

void Synchronize() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventSynchronize(event_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventSynchronize(event_));
#endif
}
gpuEvent_t GetRawCudaEvent() { return event_; }

private:
#ifdef PADDLE_WITH_HIP
unsigned int flags_ = hipEventDefault;
#else
unsigned int flags_ = cudaEventDefault;
#endif
gpuEvent_t event_;
#endif
};

static unsigned int get_cuda_flags(bool enable_timing, bool blocking,
bool interprocess) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#ifdef PADDLE_WITH_HIP
unsigned int flags =
(blocking ? hipEventBlockingSync : hipEventDefault) |
(enable_timing ? hipEventDefault : hipEventDisableTiming) |
(interprocess ? hipEventInterprocess : hipEventDefault);
return flags;
#else
unsigned int flags =
(blocking ? cudaEventBlockingSync : cudaEventDefault) |
(enable_timing ? cudaEventDefault : cudaEventDisableTiming) |
(interprocess ? cudaEventInterprocess : cudaEventDefault);
return flags;
#endif

#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot get the cuda event flags."));
return 0;
#endif
}

} // namespace platform
} // namespace paddle
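
The snippet below is a usage sketch, not part of the diff: it shows how the new CudaEvent might be driven with flags built by get_cuda_flags and recorded on a CUDAStream. The helper name RecordAndWaitOn is hypothetical, and the code assumes a Paddle source tree built with PADDLE_WITH_CUDA or PADDLE_WITH_HIP.

// Usage sketch only; RecordAndWaitOn is a hypothetical helper.
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/stream/cuda_stream.h"

namespace platform = paddle::platform;

void RecordAndWaitOn(platform::stream::CUDAStream& stream) {
  // No timing, non-blocking host wait, no interprocess sharing.
  unsigned int flags = platform::get_cuda_flags(
      /*enable_timing=*/false, /*blocking=*/false, /*interprocess=*/false);

  platform::CudaEvent event(flags);
  event.Record(stream);    // capture the work queued on the stream so far
  if (!event.Query()) {    // non-blocking completion check
    event.Synchronize();   // block until the recorded work has finished
  }
}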
23 changes: 23 additions & 0 deletions paddle/fluid/platform/stream/cuda_stream.cc
@@ -14,6 +14,7 @@ limitations under the License. */

#include "paddle/fluid/platform/stream/cuda_stream.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
@@ -95,6 +96,28 @@ void CUDAStream::Wait() const {
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
}

CUDAStream* get_current_stream(int deviceId) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (deviceId == -1) {
deviceId = platform::GetCurrentDeviceId();
}

auto& pool = platform::DeviceContextPool::Instance();

platform::Place device = CUDAPlace(deviceId);

auto stream = static_cast<platform::CUDADeviceContext*>(pool.Get(device))
->context()
->Stream()
.get();
return stream;
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with CUDA. Cannot visit cuda current stream."));
return nullptr;
#endif
}

} // namespace stream
} // namespace platform
} // namespace paddle
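
As a rough sketch of the new get_current_stream helper (again not part of the diff; UseCurrentStream is a hypothetical wrapper): passing -1 resolves the currently active device, while a non-negative deviceId selects that device's default stream from the DeviceContextPool.

// Usage sketch only; assumes a Paddle build with CUDA or HIP enabled.
#include "paddle/fluid/platform/stream/cuda_stream.h"

namespace stream = paddle::platform::stream;

void UseCurrentStream() {
  stream::CUDAStream* current = stream::get_current_stream(-1);  // active device
  stream::CUDAStream* dev0 = stream::get_current_stream(0);      // device 0

  // The raw handle can be handed to kernel launches or other CUDA/HIP calls.
  auto raw = current->raw_stream();
  (void)raw;
  (void)dev0;
}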
38 changes: 35 additions & 3 deletions paddle/fluid/platform/stream/cuda_stream.h
@@ -33,8 +33,9 @@ enum class Priority : uint8_t {
kHigh = 0x1,
kNormal = 0x2,
};

#endif
class CUDAStream final {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
public:
CUDAStream() = default;
explicit CUDAStream(const Place& place,
@@ -93,6 +94,37 @@ class CUDAStream final {
#endif
void Destroy();

bool Query() const {
#ifdef PADDLE_WITH_HIP
hipError_t err = hipStreamQuery(stream_);
if (err == hipSuccess) {
return true;
}
if (err == hipErrorNotReady) {
return false;
}
#else
cudaError_t err = cudaStreamQuery(stream_);
if (err == cudaSuccess) {
return true;
}
if (err == cudaErrorNotReady) {
return false;
}
#endif

PADDLE_ENFORCE_CUDA_SUCCESS(err);
return false;
}

void Synchronize() const {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
#endif
}

private:
Place place_;
#ifdef PADDLE_WITH_HIP
@@ -102,11 +134,11 @@
#endif
Priority priority_{Priority::kNormal};
std::unique_ptr<StreamCallbackManager<gpuStream_t>> callback_manager_;

#endif
DISABLE_COPY_AND_ASSIGN(CUDAStream);
};

#endif
CUDAStream* get_current_stream(int deviceId);

} // namespace stream
} // namespace platform
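
A final sketch for the stream-level Query() and Synchronize() additions, which mirror cudaStreamQuery/cudaStreamSynchronize (or their hipStream equivalents); DrainStream is a hypothetical helper.

// Usage sketch only; DrainStream is a hypothetical helper.
#include "paddle/fluid/platform/stream/cuda_stream.h"

void DrainStream(const paddle::platform::stream::CUDAStream& stream) {
  if (stream.Query()) {
    // Everything previously queued on the stream has already finished.
    return;
  }
  // Otherwise block the calling host thread until it has.
  stream.Synchronize();
}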
3 changes: 2 additions & 1 deletion paddle/fluid/pybind/CMakeLists.txt
@@ -57,7 +57,8 @@ set(PYBIND_SRCS
inference_api.cc
compatible.cc
io.cc
generator_py.cc)
generator_py.cc
cuda_streams_py.cc)

if(WITH_ASCEND)
set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper)