correct sync behavior for XPU distributed training (#47882)
* correct sync behavior for XPU distributed training

XPU supports an event mechanism similar to CUDA events, so using an event to
synchronize the compute and communication streams is preferable for performance.
However, this mechanism has never been fully tested, and inconsistent losses and
ending epochs have been reported. Therefore, this PR replaces the event-based
sync with stream waiting as a temporary solution.

* remove compile warning
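
For context, a minimal sketch of the two synchronization strategies the message refers to. This is illustrative only, not the Paddle implementation: the function names SyncViaEvent/SyncViaStreamWait and the XPUStream/XPUEvent handle types are assumptions; xpu_event_record, xpu_stream_wait_event, and xpu_wait are the runtime calls that appear in the diff below.

// Illustrative sketch only (not the actual Paddle code). Event-based sync keeps the
// host free: the comm stream waits on an event recorded on the compute stream.
void SyncViaEvent(XPUStream compute_stream, XPUStream comm_stream, XPUEvent event) {
  xpu_event_record(event, compute_stream);    // mark the current point on the compute stream
  xpu_stream_wait_event(comm_stream, event);  // comm stream waits for that point, host continues
}

// Temporary workaround adopted by this PR: block the host on the compute stream, so any
// work later enqueued on the comm stream is guaranteed to see the compute results.
void SyncViaStreamWait(XPUStream compute_stream) {
  xpu_wait(compute_stream);                   // host blocks until the compute stream drains
}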
XiaociZhang authored Nov 18, 2022
1 parent 1fb4d90 commit aafa982
Showing 4 changed files with 12 additions and 34 deletions.
18 changes: 3 additions & 15 deletions paddle/fluid/distributed/collective/BKCLTools.h
@@ -77,23 +77,11 @@ class XPUEventManager {
device_index_));

platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event_, ctx.stream()));
// TODO(zhangxiaoci) temporary solution: xpu::event seems buggy
PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(ctx.stream()));
}

void Block(const XPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"XPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(ctx.stream(), event_));
}
}
void Block(const XPUContext& ctx) const {}

private:
bool is_created_{false};
6 changes: 6 additions & 0 deletions paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
@@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) {

if (barrier_) {
// If we use the work to do barrier, we should block cpu

// TODO(zhangxiaoci) There is no such function that can sync entire device
// for xpu (for now), so all we can do is sync whatever stream that we know
// and hope for the best. Note that for correctness the communication stream
// needs to be in sync mode.
platform::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait();
calc_ctx->Wait();
}
return true;
}
2 changes: 1 addition & 1 deletion paddle/phi/backends/xpu/xpu_context.cc
@@ -64,7 +64,7 @@ struct XPUContext::Impl {
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(context_->xpu_stream));
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
xpu::destroy_context(context_);
context_ = nullptr;
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"

#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"

namespace phi {
@@ -67,14 +65,7 @@ class ConcatFunctor<XPUContext, T> {
reinterpret_cast<XPUType*>(output->data<T>()),
xdims_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat");
}
};

@@ -126,14 +117,7 @@ class SplitFunctor<XPUContext, T> {
xdims_list,
split_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "split");
}
};

