From 44026d41acea604a571b656dddfcab1287097272 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 18:30:03 +0800
Subject: [PATCH 1/6] add numeric test=develop

---
 paddle/fluid/framework/section_worker.cc      |  3 +
 .../operators/collective/c_allreduce_op.h     | 99 +++++++++++++++++--
 2 files changed, 94 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index f68ee153e0025..a4f0d3be4aa8c 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -180,6 +180,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
     fw_step += 1;
     bw_step += 1;
+    VLog(2) << "micro steps fw_step:" << fw_step << ", bw_step:" << bw_step;
   }
 
   int reserve_bw_send_step = bw_step - 2;
@@ -187,8 +188,10 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
   while (bw_step < num_microbatches_) {
     RunBackward(bw_step, gc, unused_vars_);
     bw_step += 1;
+    VLog(2) << "micro steps bw_step:" << bw_step;
   }
 
+  VLog(2) << "run update";
   RunUpdate(gc, unused_vars_);
 
   if (gc) {
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 3a74f551e7a30..4d027d42532f2 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL)
@@ -119,13 +120,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+// return true if found_inf_or_nan or return false;
+template <typename T>
+bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
+                   aclrtStream stream, const paddle::framework::Tensor* in) {
+  auto& dev_ctx =
+      exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
+  using Tensor = paddle::framework::Tensor;
+  Tensor out(in->type());
+  out.Resize(in->dims());
+  out.mutable_data<T>(dev_ctx.GetPlace());
+
+  bool found_inf_data = false;
+
+  try {
+    const auto& runner =
+        NpuOpRunner("CheckNumerics", {*in}, {out},
+                    {{"message", std::string("check_numberics")}});
+    runner.Run(stream);
+    dev_ctx.Wait();
+  } catch (platform::EnforceNotMet& exception) {
+    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
+    found_inf_data = true;
+  } catch (...) {
+    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
+    found_inf_data = true;
+  }
+
+  return found_inf_data;
+}
+
 template <ReduceType red_type, typename T>
 class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
+    auto in = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    const auto* float_status = ctx.Input<framework::LoDTensor>("FloatStatus");
     auto place = ctx.GetPlace();
     HcclDataType dtype = platform::ToHCCLDataType(in->type());
     int64_t numel = in->numel();
@@ -141,9 +173,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
         paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
 
     aclrtStream stream = nullptr;
-    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(place));
     if (ctx.Attr<bool>("use_calc_stream")) {
-      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+      stream = dev_ctx->stream();
     } else {
       stream = comm->stream();
     }
@@ -171,9 +204,53 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
           "Invalid reduce type: %d", red_type));
     }
 
-    VLOG(3) << "begin hccl allreduce, parameter is: "
+    VLOG(3) << "hccl allreduce, parameter is: "
+            << "input num: " << in->dims() << "dtype: " << dtype
+            << "hccl_red_type: " << hccl_red_type << ", group is: " << group
+            << ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
+            << ", out_size:" << out->memory_size()
+            << ", use_calc_stream:" << ctx.Attr<bool>("use_calc_stream")
+            << ", stream:" << stream;
+
+    framework::Tensor tmp;
+    tmp.mutable_data<float>({8}, ctx.GetPlace());
+
+    bool nan_or_inf = false;
+    bool check_numerics = false;
+    if (float_status) {
+      VLOG(4) << "prepare to FoundNanInf";
+      nan_or_inf = FoundNanOrInf(
+          ctx.template device_context<paddle::platform::NPUDeviceContext>(),
+          dev_ctx->stream(), float_status, &tmp);
+    } else {
+      auto d_type = in->type();
+      switch (d_type) {
+        case framework::proto::VarType::FP16:
+        case framework::proto::VarType::FP32: {
+          VLOG(4) << "prepare to FoundNanInf";
+          check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
+          VLOG(4) << "check_numerics:" << check_numerics;
+          break;
+        }
+        default:
+          break;
+      }
+    }
+
+    if (nan_or_inf || check_numerics) {
+      T inf = static_cast<T>(std::numeric_limits<float>::infinity());
+      VLOG(4) << "fill input data constant inf";
+      auto dims = in->dims();
+      auto mutable_in = const_cast<framework::LoDTensor*>(in);
+      FillNpuTensorWithConstant<T>(mutable_in, inf);
+      mutable_in->Resize(dims);
+    }
+
+    VLOG(3) << "hccl allreduce, parameter is: "
             << "input num: " << numel << "dtype: " << dtype
-            << "hccl_red_type: " << hccl_red_type << ", group is: " << group;
+            << "hccl_red_type: " << hccl_red_type << ", group is: " << group
+            << ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
+            << ", out_size:" << out->memory_size();
 
     PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
         sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
@@ -198,7 +275,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     BKCLDataType dtype = platform::ToBKCLDataType(in->type());
     int64_t numel = in->numel();
-    const void* sendbuff = in->data<T>();
+    const void* sendbuff = in->data<T>();
     out->Resize(in->dims());
     void* recvbuff = out->mutable_data<T>(place);
@@ -260,7 +337,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
     int64_t numel = in->numel();
-    const void* sendbuff = in->data<T>();
+    const void* sendbuff = in->data<T>();
     out->Resize(in->dims());
     void* recvbuff = out->mutable_data<T>(place);
@@ -328,6 +405,12 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
         "parallel mode, the backward is c_identity which returns itself for "
         "c_allreduce_sum.")
         .SetDefault(false);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddInput("FloatStatus",
+             "(Tensor) 1-dim tensor of shape [8], allocated by "
+             "alloc_float_status op")
+        .AsDispensable();
+#endif
     AddComment(string::Sprintf(R"DOC(
 CAllReduce %s Operator
 

From 30492c1850338c7986f7299f0c3cafd3f4843042 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 20:10:21 +0800
Subject: [PATCH 2/6] cleanup test=develop

---
 .../operators/collective/c_allreduce_op.h     | 38 ++++++-------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 4d027d42532f2..eaa219c28267a 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -157,7 +157,6 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_ASCEND_CL)
     auto in = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
-    const auto* float_status = ctx.Input<framework::LoDTensor>("FloatStatus");
     auto place = ctx.GetPlace();
     HcclDataType dtype = platform::ToHCCLDataType(in->type());
     int64_t numel = in->numel();
@@ -215,29 +214,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
     framework::Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
 
-    bool nan_or_inf = false;
     bool check_numerics = false;
-    if (float_status) {
-      VLOG(4) << "prepare to FoundNanInf";
-      nan_or_inf = FoundNanOrInf(
-          ctx.template device_context<paddle::platform::NPUDeviceContext>(),
-          dev_ctx->stream(), float_status, &tmp);
-    } else {
-      auto d_type = in->type();
-      switch (d_type) {
-        case framework::proto::VarType::FP16:
-        case framework::proto::VarType::FP32: {
-          VLOG(4) << "prepare to FoundNanInf";
-          check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
-          VLOG(4) << "check_numerics:" << check_numerics;
-          break;
-        }
-        default:
-          break;
+
+    auto d_type = in->type();
+    switch (d_type) {
+      case framework::proto::VarType::FP16:
+      case framework::proto::VarType::FP32: {
+        VLOG(4) << "prepare to FoundNanInf";
+        check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
+        VLOG(4) << "check_numerics:" << check_numerics;
+        break;
       }
+      default:
+        break;
     }
 
-    if (nan_or_inf || check_numerics) {
+    if (check_numerics) {
       T inf = static_cast<T>(std::numeric_limits<float>::infinity());
       VLOG(4) << "fill input data constant inf";
       auto dims = in->dims();
@@ -405,12 +397,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
         "parallel mode, the backward is c_identity which returns itself for "
         "c_allreduce_sum.")
         .SetDefault(false);
-#if defined(PADDLE_WITH_ASCEND_CL)
-    AddInput("FloatStatus",
-             "(Tensor) 1-dim tensor of shape [8], allocated by "
-             "alloc_float_status op")
-        .AsDispensable();
-#endif
     AddComment(string::Sprintf(R"DOC(
 CAllReduce %s Operator
 

From 3e42fd65c4b149a2132edef2418619feffa3e434 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 20:11:38 +0800
Subject: [PATCH 3/6] cleanup test=develop

---
 paddle/fluid/framework/section_worker.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index a4f0d3be4aa8c..146798d557320 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -180,7 +180,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
     fw_step += 1;
     bw_step += 1;
-    VLog(2) << "micro steps fw_step:" << fw_step << ", bw_step:" << bw_step;
+    VLOG(2) << "micro steps fw_step:" << fw_step << ", bw_step:" << bw_step;
   }
 
   int reserve_bw_send_step = bw_step - 2;
@@ -188,10 +188,10 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
   while (bw_step < num_microbatches_) {
     RunBackward(bw_step, gc, unused_vars_);
     bw_step += 1;
-    VLog(2) << "micro steps bw_step:" << bw_step;
+    VLOG(2) << "micro steps bw_step:" << bw_step;
   }
 
-  VLog(2) << "run update";
+  VLOG(2) << "run update";
   RunUpdate(gc, unused_vars_);
 
   if (gc) {

From bf95721f041f65676c06bae769cef02309aab06a Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 20:38:57 +0800
Subject: [PATCH 4/6] cleanup test=develop

---
 paddle/fluid/framework/section_worker.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 146798d557320..2b00c5c674fb1 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -164,6 +164,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
   while (fw_step < startup_steps) {
     RunForward(fw_step, gc, unused_vars_);
     fw_step += 1;
+    VLOG(2) << "micro steps fw_step:" << fw_step;
   }
 
   // 1f1b phase

From 091f11fd2044699bcee7ef3ebd0b259e3d56d994 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 21:30:54 +0800
Subject: [PATCH 5/6] cleanup test=develop

---
 paddle/fluid/framework/section_worker.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 2b00c5c674fb1..5df01e151f805 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -207,6 +207,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
 
 void SectionWorker::TrainFiles() {
   VLOG(5) << "begin section_worker TrainFiles";
+  VLOG(2) << "mini batch steps:" << batch_id_;
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;

From 130a99d8e49f6b6998aa646d7a9242d55d4b1007 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jul 2021 14:24:22 +0800
Subject: [PATCH 6/6] fix compilation test=develop

---
 paddle/fluid/operators/collective/c_allreduce_op.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index eaa219c28267a..3c51c65b07390 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -120,6 +120,7 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+#if defined(PADDLE_WITH_ASCEND_CL)
 // return true if found_inf_or_nan or return false;
 template <typename T>
 bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
@@ -149,6 +150,7 @@ bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
 
   return found_inf_data;
 }
+#endif
 
 template <ReduceType red_type, typename T>
 class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
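
Note on the series as a whole: PATCH 1/6 makes the NPU allreduce fail loudly
rather than silently. It runs the NPU "CheckNumerics" operator over the input
before HcclAllReduce and, if NaN/Inf is reported, overwrites the whole input
with +Inf, so the allreduce deterministically propagates the failure to every
rank instead of mixing corrupted gradients into the reduction. Below is a
minimal, framework-free C++ sketch of that control flow; std::isfinite stands
in for the NPU CheckNumerics operator, and every name in the sketch is
illustrative rather than Paddle API:

  #include <algorithm>
  #include <cmath>
  #include <iostream>
  #include <limits>
  #include <vector>

  // Stand-in for the NPU "CheckNumerics" op: true if any element is NaN/Inf.
  bool ContainsNanOrInf(const std::vector<float>& buf) {
    return std::any_of(buf.begin(), buf.end(),
                       [](float v) { return !std::isfinite(v); });
  }

  // Mirrors the kernel's strategy: poison the send buffer with +Inf when a
  // bad value is found, so the (simulated) allreduce spreads the failure to
  // all ranks instead of averaging garbage on only some of them.
  void CheckBeforeAllReduce(std::vector<float>* sendbuff) {
    if (ContainsNanOrInf(*sendbuff)) {
      std::fill(sendbuff->begin(), sendbuff->end(),
                std::numeric_limits<float>::infinity());
    }
  }

  int main() {
    std::vector<float> grads = {0.5f, -1.25f, std::nanf("")};
    CheckBeforeAllReduce(&grads);
    for (float v : grads) std::cout << v << ' ';  // prints: inf inf inf
    std::cout << '\n';
    return 0;
  }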