correct sync behavior for XPU distributed training (#47882)
* correct sync behavior for XPU distributed training

XPU supports an event mechanism similar to CUDA events, so using an event to
synchronize the compute and communication streams is preferable for performance.
However, this mechanism has never been fully tested, and inconsistent losses and
ending epochs have been reported. Therefore, this PR replaces the event-based
sync with stream waiting as a temporary solution.

* remove compile warning
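
For context, a minimal sketch of the two synchronization strategies the message refers to. This is illustrative only, not the Paddle implementation: the function names SyncViaEvent/SyncViaStreamWait and the XPUStream/XPUEvent handle types are assumptions; xpu_event_record, xpu_stream_wait_event, and xpu_wait are the runtime calls that appear in the diff below.

// Illustrative sketch only (not the actual Paddle code). Event-based sync keeps the
// host free: the comm stream waits on an event recorded on the compute stream.
void SyncViaEvent(XPUStream compute_stream, XPUStream comm_stream, XPUEvent event) {
  xpu_event_record(event, compute_stream);    // mark the current point on the compute stream
  xpu_stream_wait_event(comm_stream, event);  // comm stream waits for that point, host continues
}

// Temporary workaround adopted by this PR: block the host on the compute stream, so any
// work later enqueued on the comm stream is guaranteed to see the compute results.
void SyncViaStreamWait(XPUStream compute_stream) {
  xpu_wait(compute_stream);                   // host blocks until the compute stream drains
}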
XiaociZhang authored Nov 18, 2022
1 parent 1fb4d90 commit aafa982
Showing 4 changed files with 12 additions and 34 deletions.
18 changes: 3 additions & 15 deletions paddle/fluid/distributed/collective/BKCLTools.h
@@ -77,23 +77,11 @@ class XPUEventManager {
device_index_));

platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event_, ctx.stream()));
// TODO(zhangxiaoci) temporary solution: xpu::event seems buggy
PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(ctx.stream()));
}

void Block(const XPUContext& ctx) const {
if (is_created_) {
auto device_index = ctx.GetPlace().device;
PADDLE_ENFORCE_EQ(device_index,
device_index_,
platform::errors::PreconditionNotMet(
"XPUContext's device %d does not match"
"Event's device %d",
device_index,
device_index_));
platform::XPUDeviceGuard guard(device_index_);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(ctx.stream(), event_));
}
}
void Block(const XPUContext& ctx) const {}

private:
bool is_created_{false};
6 changes: 6 additions & 0 deletions paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
@@ -57,8 +57,14 @@ bool ProcessGroupBKCL::BKCLTask::Wait(std::chrono::milliseconds timeout) {

if (barrier_) {
// If we use the work to do barrier, we should block cpu

// TODO(zhangxiaoci) There is no such function that can sync entire device
// for xpu (for now), so all we can do is sync whatever stream that we know
// and hope for the best. Note that for correctness the communication stream
// needs to be in sync mode.
platform::XPUDeviceGuard guard(place_.GetDeviceId());
xpu_wait();
calc_ctx->Wait();
}
return true;
}
2 changes: 1 addition & 1 deletion paddle/phi/backends/xpu/xpu_context.cc
@@ -64,7 +64,7 @@ struct XPUContext::Impl {
// manually destroy XPUStream here until xpu::api integrates this work
// into Context dtor
xpu_wait(context_->xpu_stream);
PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(context_->xpu_stream));
xpu_stream_destroy(context_->xpu_stream);
context_->xpu_stream = nullptr;
xpu::destroy_context(context_);
context_ = nullptr;
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"

#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"

namespace phi {
@@ -67,14 +65,7 @@ class ConcatFunctor<XPUContext, T> {
reinterpret_cast<XPUType*>(output->data<T>()),
xdims_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "concat");
}
};

@@ -126,14 +117,7 @@ class SplitFunctor<XPUContext, T> {
xdims_list,
split_list,
axis);
PADDLE_ENFORCE_EQ(
r,
XPU_SUCCESS,
paddle::platform::errors::External(
"XPU API return wrong value[%d %s], please check whether "
"Baidu Kunlun Card is properly installed.",
r,
XPUAPIErrorMsg[r]));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "split");
}
};

