From 44026d41acea604a571b656dddfcab1287097272 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 18:30:03 +0800
Subject: [PATCH 1/6] add numeric test=develop

---
 paddle/fluid/framework/section_worker.cc      |  3 +
 .../operators/collective/c_allreduce_op.h     | 99 +++++++++++++++++--
 2 files changed, 94 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index f68ee153e0025..a4f0d3be4aa8c 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -180,6 +180,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
     fw_step += 1;
     bw_step += 1;
+    VLog(2) << "micro steps fw_step:" << fw_step << ", bw_step:" << bw_step;
   }
 
   int reserve_bw_send_step = bw_step - 2;
@@ -187,8 +188,10 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
   while (bw_step < num_microbatches_) {
     RunBackward(bw_step, gc, unused_vars_);
     bw_step += 1;
+    VLog(2) << "micro steps bw_step:" << bw_step;
   }
 
+  VLog(2) << "run update";
   RunUpdate(gc, unused_vars_);
 
   if (gc) {
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 3a74f551e7a30..4d027d42532f2 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL)
@@ -119,13 +120,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+// return true if found_inf_or_nan or return false;
+template <typename T>
+bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
+                   aclrtStream stream, const paddle::framework::Tensor* in) {
+  auto& dev_ctx =
+      exe_ctx.template device_context<paddle::platform::NPUDeviceContext>();
+  using Tensor = paddle::framework::Tensor;
+  Tensor out(in->type());
+  out.Resize(in->dims());
+  out.mutable_data<T>(dev_ctx.GetPlace());
+
+  bool found_inf_data = false;
+
+  try {
+    const auto& runner =
+        NpuOpRunner("CheckNumerics", {*in}, {out},
+                    {{"message", std::string("check_numberics")}});
+    runner.Run(stream);
+    dev_ctx.Wait();
+  } catch (platform::EnforceNotMet& exception) {
+    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
+    found_inf_data = true;
+  } catch (...) {
+    LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
+    found_inf_data = true;
+  }
+
+  return found_inf_data;
+}
+
 template <ReduceType red_type, typename T>
 class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-    auto in = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
+    auto in = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    const auto* float_status = ctx.Input<framework::LoDTensor>("FloatStatus");
     auto place = ctx.GetPlace();
     HcclDataType dtype = platform::ToHCCLDataType(in->type());
     int64_t numel = in->numel();
@@ -141,9 +173,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
         paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
 
     aclrtStream stream = nullptr;
-    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(place));
     if (ctx.Attr<bool>("use_calc_stream")) {
-      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+      stream = dev_ctx->stream();
     } else {
       stream = comm->stream();
     }
@@ -171,9 +204,53 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
           "Invalid reduce type: %d", red_type));
     }
 
-    VLOG(3) << "begin hccl allreduce, parameter is: "
+    VLOG(3) << "hccl allreduce, parameter is: "
+            << "input num: " << in->dims() << "dtype: " << dtype
+            << "hccl_red_type: " << hccl_red_type << ", group is: " << group
+            << ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
+            << ", out_size:" << out->memory_size()
+            << ", use_calc_stream:" << ctx.Attr<bool>("use_calc_stream")
+            << ", stream:" << stream;
+
+    framework::Tensor tmp;
+    tmp.mutable_data<float>({8}, ctx.GetPlace());
+
+    bool nan_or_inf = false;
+    bool check_numerics = false;
+    if (float_status) {
+      VLOG(4) << "prepare to FoundNanInf";
+      nan_or_inf = FoundNanOrInf(
+          ctx.template device_context<paddle::platform::NPUDeviceContext>(),
+          dev_ctx->stream(), float_status, &tmp);
+    } else {
+      auto d_type = in->type();
+      switch (d_type) {
+        case framework::proto::VarType::FP16:
+        case framework::proto::VarType::FP32: {
+          VLOG(4) << "prepare to FoundNanInf";
+          check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
+          VLOG(4) << "check_numerics:" << check_numerics;
+          break;
+        }
+        default:
+          break;
+      }
+    }
+
+    if (nan_or_inf || check_numerics) {
+      T inf = static_cast<T>(std::numeric_limits<float>::infinity());
+      VLOG(4) << "fill input data constant inf";
+      auto dims = in->dims();
+      auto mutable_in = const_cast<framework::LoDTensor*>(in);
+      FillNpuTensorWithConstant<T>(mutable_in, inf);
+      mutable_in->Resize(dims);
+    }
+
+    VLOG(3) << "hccl allreduce, parameter is: "
             << "input num: " << numel << "dtype: " << dtype
-            << "hccl_red_type: " << hccl_red_type << ", group is: " << group;
+            << "hccl_red_type: " << hccl_red_type << ", group is: " << group
+            << ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
+            << ", out_size:" << out->memory_size();
 
     PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
         sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
@@ -198,7 +275,7 @@ class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     BKCLDataType dtype = platform::ToBKCLDataType(in->type());
     int64_t numel = in->numel();
-    const void* sendbuff = in->data<T>();
+    const void* sendbuff = in->data<T>();
     out->Resize(in->dims());
     void* recvbuff = out->mutable_data<T>(place);
@@ -260,7 +337,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
     int64_t numel = in->numel();
-    const void* sendbuff = in->data<T>();
+    const void* sendbuff = in->data<T>();
     out->Resize(in->dims());
     void* recvbuff = out->mutable_data<T>(place);
@@ -328,6 +405,12 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
         "parallel mode, the backward is c_identity which returns itself for "
         "c_allreduce_sum.")
         .SetDefault(false);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddInput("FloatStatus",
+             "(Tensor) 1-dim tensor of shape [8], allocated by "
+             "alloc_float_status op")
+        .AsDispensable();
+#endif
     AddComment(string::Sprintf(R"DOC(
 CAllReduce %s Operator
 

From 30492c1850338c7986f7299f0c3cafd3f4843042 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 20:10:21 +0800
Subject: [PATCH 2/6] cleanup test=develop

---
 .../operators/collective/c_allreduce_op.h     | 38 ++++++-------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 4d027d42532f2..eaa219c28267a 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -157,7 +157,6 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_ASCEND_CL)
     auto in = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
-    const auto* float_status = ctx.Input<framework::LoDTensor>("FloatStatus");
     auto place = ctx.GetPlace();
     HcclDataType dtype = platform::ToHCCLDataType(in->type());
     int64_t numel = in->numel();
@@ -215,29 +214,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
     framework::Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
 
-    bool nan_or_inf = false;
     bool check_numerics = false;
-    if (float_status) {
-      VLOG(4) << "prepare to FoundNanInf";
-      nan_or_inf = FoundNanOrInf(
-          ctx.template device_context<paddle::platform::NPUDeviceContext>(),
-          dev_ctx->stream(), float_status, &tmp);
-    } else {
-      auto d_type = in->type();
-      switch (d_type) {
-        case framework::proto::VarType::FP16:
-        case framework::proto::VarType::FP32: {
-          VLOG(4) << "prepare to FoundNanInf";
-          check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
-          VLOG(4) << "check_numerics:" << check_numerics;
-          break;
-        }
-        default:
-          break;
+
+    auto d_type = in->type();
+    switch (d_type) {
+      case framework::proto::VarType::FP16:
+      case framework::proto::VarType::FP32: {
+        VLOG(4) << "prepare to FoundNanInf";
+        check_numerics = CheckNumerics<T>(ctx, dev_ctx->stream(), in);
+        VLOG(4) << "check_numerics:" << check_numerics;
+        break;
       }
+      default:
+        break;
     }
 
-    if (nan_or_inf || check_numerics) {
+    if (check_numerics) {
       T inf = static_cast<T>(std::numeric_limits<float>::infinity());
       VLOG(4) << "fill input data constant inf";
       auto dims = in->dims();
@@ -405,12 +397,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
         "parallel mode, the backward is c_identity which returns itself for "
         "c_allreduce_sum.")
         .SetDefault(false);
-#if defined(PADDLE_WITH_ASCEND_CL)
-    AddInput("FloatStatus",
-             "(Tensor) 1-dim tensor of shape [8], allocated by "
-             "alloc_float_status op")
-        .AsDispensable();
-#endif
     AddComment(string::Sprintf(R"DOC(
 CAllReduce %s Operator
 

From 3e42fd65c4b149a2132edef2418619feffa3e434 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 20:11:38 +0800
Subject: [PATCH 3/6] cleanup test=develop

---
 paddle/fluid/framework/section_worker.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index a4f0d3be4aa8c..146798d557320 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -180,7 +180,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
     fw_step += 1;
     bw_step += 1;
-    VLog(2) << "micro steps fw_step:" << fw_step << ", bw_step:" << bw_step;
+    VLOG(2) << "micro steps fw_step:" << fw_step << ", bw_step:" << bw_step;
   }
 
   int reserve_bw_send_step = bw_step - 2;
@@ -188,10 +188,10 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
   while (bw_step < num_microbatches_) {
     RunBackward(bw_step, gc, unused_vars_);
     bw_step += 1;
-    VLog(2) << "micro steps bw_step:" << bw_step;
+    VLOG(2) << "micro steps bw_step:" << bw_step;
   }
 
-  VLog(2) << "run update";
+  VLOG(2) << "run update";
   RunUpdate(gc, unused_vars_);
 
   if (gc) {

From bf95721f041f65676c06bae769cef02309aab06a Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 20:38:57 +0800
Subject: [PATCH 4/6] cleanup test=develop

---
 paddle/fluid/framework/section_worker.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 146798d557320..2b00c5c674fb1 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -164,6 +164,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
   while (fw_step < startup_steps) {
     RunForward(fw_step, gc, unused_vars_);
     fw_step += 1;
+    VLOG(2) << "micro steps fw_step:" << fw_step;
   }
 
   // 1f1b phase

From 091f11fd2044699bcee7ef3ebd0b259e3d56d994 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Wed, 28 Jul 2021 21:30:54 +0800
Subject: [PATCH 5/6] cleanup test=develop

---
 paddle/fluid/framework/section_worker.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index 2b00c5c674fb1..5df01e151f805 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -207,6 +207,7 @@ void SectionWorker::Run1F1B(std::unique_ptr<GarbageCollector> &gc) {
 
 void SectionWorker::TrainFiles() {
   VLOG(5) << "begin section_worker TrainFiles";
+  VLOG(2) << "mini batch steps:" << batch_id_;
 
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;

From 130a99d8e49f6b6998aa646d7a9242d55d4b1007 Mon Sep 17 00:00:00 2001
From: gongweibao
Date: Thu, 29 Jul 2021 14:24:22 +0800
Subject: [PATCH 6/6] fix compilation test=develop

---
 paddle/fluid/operators/collective/c_allreduce_op.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index eaa219c28267a..3c51c65b07390 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -120,6 +120,7 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+#if defined(PADDLE_WITH_ASCEND_CL)
 // return true if found_inf_or_nan or return false;
 template <typename T>
 bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
@@ -149,6 +150,7 @@ bool CheckNumerics(const framework::ExecutionContext& exe_ctx,
 
   return found_inf_data;
 }
+#endif
 
 template <ReduceType red_type, typename T>
 class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
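
Note on the series as a whole: PATCH 1/6 makes the NPU allreduce fail loudly
rather than silently. It runs the NPU "CheckNumerics" operator over the input
before HcclAllReduce and, if NaN/Inf is reported, overwrites the whole input
with +Inf, so the allreduce deterministically propagates the failure to every
rank instead of mixing corrupted gradients into the reduction. Below is a
minimal, framework-free C++ sketch of that control flow; std::isfinite stands
in for the NPU CheckNumerics operator, and every name in the sketch is
illustrative rather than Paddle API:

  #include <algorithm>
  #include <cmath>
  #include <iostream>
  #include <limits>
  #include <vector>

  // Stand-in for the NPU "CheckNumerics" op: true if any element is NaN/Inf.
  bool ContainsNanOrInf(const std::vector<float>& buf) {
    return std::any_of(buf.begin(), buf.end(),
                       [](float v) { return !std::isfinite(v); });
  }

  // Mirrors the kernel's strategy: poison the send buffer with +Inf when a
  // bad value is found, so the (simulated) allreduce spreads the failure to
  // all ranks instead of averaging garbage on only some of them.
  void CheckBeforeAllReduce(std::vector<float>* sendbuff) {
    if (ContainsNanOrInf(*sendbuff)) {
      std::fill(sendbuff->begin(), sendbuff->end(),
                std::numeric_limits<float>::infinity());
    }
  }

  int main() {
    std::vector<float> grads = {0.5f, -1.25f, std::nanf("")};
    CheckBeforeAllReduce(&grads);
    for (float v : grads) std::cout << v << ' ';  // prints: inf inf inf
    std::cout << '\n';
    return 0;
  }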