@@ -33,19 +33,55 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
 #ifdef PADDLE_WITH_XPU
-XPUStream get_current_stream(int device_id) {
+phi::XPUStreamHandle* get_current_stream(int device_id) {
   if (device_id == -1) {
     device_id = phi::backends::xpu::GetXPUCurrentDeviceId();
   }
   auto place = phi::XPUPlace(device_id);
   auto *dev_ctx = static_cast<phi::XPUContext *>(
       phi::DeviceContextPool::Instance().Get(place));
-  dev_ctx->Wait();
-  return dev_ctx->stream();
+  // NOTE: the handle is heap-allocated; the binding below exposes it with
+  // py::return_value_policy::reference, so nothing ever frees it --
+  // take_ownership is likely the safer policy there.
+  auto handle = new phi::XPUStreamHandle();
+  return handle;
 }
 
+// Switch the current device's context to the pooled stream at index `idx`
+// and return a handle to the previously current stream so the caller can
+// restore it later.
+phi::XPUStreamHandle* set_current_stream(int idx) {
+  int device_id = phi::backends::xpu::GetXPUCurrentDeviceId();
+  auto original_stream = get_current_stream(device_id);
+  auto place = phi::XPUPlace(device_id);
+  auto *dev_ctx = static_cast<phi::XPUContext *>(
+      phi::DeviceContextPool::Instance().Get(place));
+  dev_ctx->SetCurrentStream(idx);
+  return original_stream;
+}
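+// A minimal usage sketch from the Python side (assuming the
+// `_xpu_get_current_stream` / `_xpu_set_current_stream` bindings below,
+// where `core` stands for paddle's compiled binding module):
+//   prev = core._xpu_set_current_stream(new_idx)   # switch + save previous
+//   ...                                            # enqueue work on new_idx
+//   core._xpu_set_current_stream(prev.idx)         # restore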
+
 #endif
 }  // namespace platform
+
 namespace pybind {
 void BindXpuStream(py::module *m_ptr) {
   auto &m = *m_ptr;
@@ -69,7 +105,7 @@ void BindXpuStream(py::module *m_ptr) {
 #endif
   });
   m.def(
-      "_get_current_stream",
+      "_xpu_get_current_stream",
       [](int device_id) {
 #ifdef PADDLE_WITH_XPU
         if (device_id == -1) {
@@ -79,11 +115,18 @@ void BindXpuStream(py::module *m_ptr) {
         return platform::get_current_stream(device_id);
 #else
         PADDLE_THROW(
-            common::errors::Unavailable("Paddle is not compiled with CUDA. "
+            common::errors::Unavailable("Paddle is not compiled with XPU. "
                                         "Cannot visit device synchronize."));
 #endif
       },
       py::return_value_policy::reference);
+#ifdef PADDLE_WITH_XPU
+  m.def(
+      "_xpu_set_current_stream",
+      [](int stream_id) { return platform::set_current_stream(stream_id); },
+      py::return_value_policy::reference);
+#endif
   m.def("_device_synchronize", [](int device_id) {
 #ifdef PADDLE_WITH_XPU
     if (device_id == -1) {
@@ -101,11 +144,190 @@ void BindXpuStream(py::module *m_ptr) {
   });
 
 #ifdef PADDLE_WITH_XPU
-  py::class_<XPUStream>(m, "XPUStream", R"DOC(
-      The handle of the CUDA stream.
+  py::class_<phi::XPUStreamHandle>(m, "XPUStream", R"DOC(
+      The handle of the XPU stream.
 
       Parameters:
-        device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream.
+        device(paddle.XPUPlace()|int|None, optional): The device on which to allocate the stream.
+          If device is None or a negative integer, the current device is used.
+          If device is a positive integer, it must be less than the device count. Default: None.
+        priority(int|None, optional): The priority of the stream. The priority can be 1(high) or 2(normal).
+          If priority is None, the priority is 2(normal). Default: None.
+
+      Examples:
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:XPU)
+            >>> import paddle
+            >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0), 1)
+            >>> s2 = paddle.device.xpu.Stream(0, 1)
+            >>> s3 = paddle.device.xpu.Stream()
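+            >>> # A hedged sketch of cross-stream ordering using the
+            >>> # wait_stream binding added in this change:
+            >>> s2.wait_stream(s1)
+            >>> print(s3.idx)  # pool index of the underlying stream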
+
+  )DOC")
+      .def("__init__",
+           [](phi::XPUStreamHandle &self) {
+             // Ensure the current device's context exists, then
+             // placement-new the handle into pybind's storage.
+             phi::get_xpu_context();
+             new (&self) phi::XPUStreamHandle();
+           })
+      .def_property_readonly(
+          "xpu_stream",
+          // Returns the handle itself; exposing the raw stream pointer
+          // (raw_stream()) appears to be left for a follow-up.
+          [](phi::XPUStreamHandle &self) { return self; })
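+      // Ordering sketch (an assumption about StreamWaitStreamInPool
+      // semantics): `self.wait_stream(other)` should make work queued on
+      // `self` afterwards wait for everything already queued on `other`:
+      //   s2.wait_stream(s1)  # s2's later kernels run only after s1's
+      //                       # queued kernels finish.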
+      .def("wait_stream",
+           [](phi::XPUStreamHandle &self, phi::XPUStreamHandle &other) {
+             auto *dev_ctx = phi::get_xpu_context();
+             dev_ctx->StreamWaitStreamInPool(self.id(), other.id());
+           })
+      .def("__init__",
+           [](phi::XPUStreamHandle &self, int device) {
+             if (device < 0) {
+               device = platform::GetXPUCurrentDeviceId();
+             }
+             // Touch the device context so the target device is initialized
+             // before the handle is constructed in place.
+             auto place_tmp = phi::XPUPlace(device);
+             phi::DeviceContextPool::Instance().Get(place_tmp);
+             new (&self) phi::XPUStreamHandle();
+           },
+           py::arg("device") = -1)
+      .def_property_readonly("place",
+                             [](phi::XPUStreamHandle &self) {
+                               // NOTE: reports the current device, not
+                               // necessarily the device the stream lives on.
+                               return phi::XPUPlace(
+                                   platform::GetXPUCurrentDeviceId());
+                             })
+      .def_property_readonly(
+          "idx", [](phi::XPUStreamHandle &self) { return self.id(); });
+  py::class_<phi::XPUEventHandle>(m, "XPUEvent", R"DOC(
+      The handle of the XPU event.
+
+      Parameters:
+        enable_timing(bool, optional): Whether the event will measure time. Default: False.
+        blocking(bool, optional): Whether the wait() function will be blocking. Default: False.
+        interprocess(bool, optional): Whether the event can be shared between processes. Default: False.
+
+      Examples:
+        .. code-block:: python
+
+            >>> # doctest: +REQUIRES(env:XPU)
+            >>> import paddle
+            >>> event = paddle.device.xpu.Event()
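+            >>> # A hedged sketch of the record/wait flow (record() falls
+            >>> # back to the current stream when no stream is passed):
+            >>> event.record()
+            >>> event.synchronize()
+            >>> print(event.query())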
+
+  )DOC")
+      .def("__init__",
+           [](phi::XPUEventHandle &self) { new (&self) phi::XPUEventHandle(); })
+      .def("record",
+           [](phi::XPUEventHandle &self, phi::XPUStreamHandle *stream) {
+             auto *dev_ctx = phi::get_xpu_context();
+             // Guard the default nullptr argument: fall back to the
+             // context's current stream instead of dereferencing null.
+             XPUStream raw_stream =
+                 stream != nullptr
+                     ? dev_ctx->get_stream_from_pool(stream->id())
+                     : dev_ctx->stream();
+             int r = xpu_event_record(self.get_event(), raw_stream);
+             PADDLE_ENFORCE_XRE_SUCCESS(r);
+           },
+           py::arg("stream") = nullptr)
+      .def("query",
+           [](phi::XPUEventHandle &self) {
+             return xpu_event_query(self.get_event());
+           })
+      .def("synchronize",
+           [](phi::XPUEventHandle &self) {
+             auto *dev_ctx = phi::get_xpu_context();
+             // NOTE: this appears to enqueue a wait on stream 0 rather than
+             // blocking the host thread, unlike CUDA's event.synchronize().
+             dev_ctx->StreamWaitEvent(self.get_event(), 0);
+           });
+
+  py::class_<phi::XPUCUDAStream>(m, "XPUCUDAStream", R"DOC(
+      The handle of the XPU stream.
+
+      Parameters:
+        device(paddle.XPUPlace()|int|None, optional): The device on which to allocate the stream.
           If device is None or negative integer, device will be the current device.
           If device is positive integer, it must be less than the device count. Default: None.
         priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal).
@@ -116,14 +338,14 @@ void BindXpuStream(py::module *m_ptr) {
 
             >>> # doctest: +REQUIRES(env:XPU)
             >>> import paddle
-            >>> s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1)
-            >>> s2 = paddle.device.cuda.Stream(0, 1)
-            >>> s3 = paddle.device.cuda.Stream()
+            >>> s1 = paddle.device.xpu.Stream(paddle.XPUPlace(0), 1)
+            >>> s2 = paddle.device.xpu.Stream(0, 1)
+            >>> s3 = paddle.device.xpu.Stream()
 
   )DOC")
       .def(
           "synchronize",
-          [](XPUStream &self) { xpu_wait(self); },
+          [](phi::XPUCUDAStream &self) { self.Synchronize(); },
           R"DOC(
       Waits for stream tasks to complete.
 
@@ -135,7 +357,26 @@ void BindXpuStream(py::module *m_ptr) {
             >>> s = paddle.device.xpu.Stream(paddle.XPUPlace(0), 1)
             >>> s.synchronize()
 
-  )DOC");
+  )DOC")
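+      // Note on `priority` (an assumption mirroring Paddle's CUDA stream
+      // bindings): `priority - 2` maps the user-facing 1(high)/2(normal)
+      // onto the underlying values -1/0.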
+      .def("__init__",
+           [](phi::XPUCUDAStream &self, phi::XPUPlace *place, int priority) {
+             if (priority != 1 && priority != 2) {
+               PADDLE_THROW(common::errors::InvalidArgument(
+                   "Priority should be 1(high) or 2(normal)"));
+             }
+             auto stream_flag =
+                 phi::XPUCUDAStream::StreamFlag::kStreamNonBlocking;
+             if (place == nullptr) {
+               int curr_device_id = platform::GetXPUCurrentDeviceId();
+               auto place_tmp = phi::XPUPlace(curr_device_id);
+               new (&self) phi::XPUCUDAStream(place_tmp, priority - 2,
+                                              stream_flag);
+             } else {
+               new (&self) phi::XPUCUDAStream(*place, priority - 2,
+                                              stream_flag);
+             }
+           });
 #endif
 }
 }  // namespace pybind