From 0d96158515bf5299a3a8e7eddffea2bc9e2d55c1 Mon Sep 17 00:00:00 2001
From: f2hkop
Date: Wed, 3 Mar 2021 14:34:10 +0800
Subject: [PATCH 1/3] send_v2_npu.cc recv_v2_npu.cc compiled

---
 .../fluid/operators/collective/recv_v2_op.cc  |  8 ++
 .../operators/collective/recv_v2_op_npu.cc    | 83 +++++++++++++++++++
 .../fluid/operators/collective/send_v2_op.cc  |  8 ++
 .../operators/collective/send_v2_op_npu.cc    | 83 +++++++++++++++++++
 4 files changed, 182 insertions(+)
 create mode 100644 paddle/fluid/operators/collective/recv_v2_op_npu.cc
 create mode 100644 paddle/fluid/operators/collective/send_v2_op_npu.cc

diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc
index 10408820387b7..739924e64492f 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cc
@@ -62,6 +62,14 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
     AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
         .SetDefault(5);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    #pragma message("tag")
+    AddAttr<std::string>("tag", "(string default tag) tag for the send/recv pair.")
+        .SetDefault("tag");
+    #pragma message("srTag")
+    AddAttr<int>("srTag", "(int default 0) tag pairing a send with its recv.")
+        .SetDefault(0);
+#endif
     AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
         .SetDefault(std::vector<int>());
     AddAttr<bool>(
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
new file mode 100644
index 0000000000000..ad2e6e85b4503
--- /dev/null
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/recv_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CRecvOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = x->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    std::string tag = ctx.Attr<std::string>("tag");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    int srcRank = ctx.Attr<int>("peer");
+    int srTag = ctx.Attr<int>("srTag");
+
+    platform::dynload::hcom_receive(
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, srcRank,
+        srTag, group.c_str(), stream);
+
+    VLOG(3) << "srcRank " << srcRank << " invoke hcom receive. receiving "
+            << x->numel();
+
+    if (out != x) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor*>(x), place,
+          *platform::DeviceContextPool::Instance().Get(place),
+          static_cast<framework::Tensor*>(out));
+    }
+
+    out->Resize(x->dims());
+    out->set_lod(x->lod());
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel<int>,
+                       ops::CRecvOpASCENDKernel<int8_t>,
+                       ops::CRecvOpASCENDKernel<float>,
+                       ops::CRecvOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc
index c5a86b4f08813..ca915ebaabc59 100644
--- a/paddle/fluid/operators/collective/send_v2_op.cc
+++ b/paddle/fluid/operators/collective/send_v2_op.cc
@@ -50,6 +50,14 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
         .SetDefault(0);
     AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    #pragma message("tag")
+    AddAttr<std::string>("tag", "(string default tag) tag for the send/recv pair.")
+        .SetDefault("tag");
+    #pragma message("srTag")
+    AddAttr<int>("srTag", "(int default 0) tag pairing a send with its recv.")
+        .SetDefault(0);
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
new file mode 100644
index 0000000000000..16af626f88a66
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CSendOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = x->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+
+    auto place = ctx.GetPlace();
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    std::string tag = ctx.Attr<std::string>("tag");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    int destRank = ctx.Attr<int>("peer");
+    int srTag = ctx.Attr<int>("srTag");
+
+    platform::dynload::hcom_send(
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, destRank,
+        srTag, group.c_str(), stream);
+
+    VLOG(3) << "destRank" << destRank << " invoke hcom send. sent "
+            << x->numel();
+
+    if (out != x) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor*>(x), place,
+          *platform::DeviceContextPool::Instance().Get(place),
+          static_cast<framework::Tensor*>(out));
+    }
+
+    out->Resize(x->dims());
+    out->set_lod(x->lod());
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel<int>,
+                       ops::CSendOpASCENDKernel<int8_t>,
+                       ops::CSendOpASCENDKernel<float>,
+                       ops::CSendOpASCENDKernel<plat::float16>);

From 6912c23aa074eda79fb8d515b411a3be0ef287aa Mon Sep 17 00:00:00 2001
From: f2hkop
Date: Thu, 4 Mar 2021 16:01:56 +0800
Subject: [PATCH 2/3] hcom send&recv test pass, without hcom_destroy

---
 .../fluid/operators/collective/CMakeLists.txt |   2 +
 .../collective/c_reducescatter_op_npu_test.cc |   1 -
 .../operators/collective/recv_v2_op_npu.cc    |  27 ++--
 .../collective/recv_v2_op_npu_test.cc         | 123 +++++++++++++++++
 .../operators/collective/send_v2_op_npu.cc    |  10 --
 .../collective/send_v2_op_npu_test.cc         | 130 ++++++++++++++++++
 6 files changed, 265 insertions(+), 28 deletions(-)
 create mode 100644 paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
 create mode 100644 paddle/fluid/operators/collective/send_v2_op_npu_test.cc

diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 03f376159f045..e34375756dfeb 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -41,3 +41,5 @@ cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc DEPS op_
 cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_max_op c_allgather_op c_reducescatter_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_allgather_op c_reducescatter_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_allgather_op c_reducescatter_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc DEPS op_registry send_v2_op recv_v2_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS op_registry send_v2_op recv_v2_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
\ No newline at end of file
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index 153bba248e111..2f24f9fad85b6 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -129,7 +129,6 @@ TEST(c_reducescatter, NPU) {
   char * npu_id=getenv("FLAGS_selected_npus");
   p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
-
   Prepare(&scope, ctx);
   TestHCCLReduceScatterOp(&scope, ctx);
 }
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
index ad2e6e85b4503..b0d9d5fd403ed 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -27,10 +27,9 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-    auto x = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
-    int numel = x->numel();
-    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+    int numel = out->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(out->type());
 
     int ring_id = ctx.Attr<int>("ring_id");
     auto place = ctx.GetPlace();
@@ -47,23 +46,17 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     int srcRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
-
+    VLOG(3) << "recv_v2_npu attr get";
     platform::dynload::hcom_receive(
-        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, srcRank,
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), numel, dtype, srcRank,
         srTag, group.c_str(), stream);
-
+    VLOG(3) << "dtype " << dtype ;
+    VLOG(3) << "numel " << numel ;
+    VLOG(3) << "srTag " << srTag ;
     VLOG(3) << "srcRank " << srcRank << " invoke hcom receive. receiving "
receiving " - << x->numel(); - - if (out != x) { - framework::TensorCopy( - *static_cast(x), place, - *platform::DeviceContextPool::Instance().Get(place), - static_cast(out)); - } - - out->Resize(x->dims()); - out->set_lod(x->lod()); + << out->numel(); + out->Resize(out->dims()); + out->set_lod(out->lod()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000..d73febedc33f6 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ + + std::string rank_table_file = getenv("RANK_TABLE_FILE"); + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + int src_rank = atoi(getenv("SRC_RANK")); + int dest_rank = atoi(getenv("DEST_RANK")); + VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0,1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + VLOG(3) << "CreateOp c_comm_init_hcom"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id<Var("Out"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"]=std::string("srtest"); + attrs["peer"]=atoi(getenv("SRC_RANK")); + attrs["ring_id"]=0; + attrs["srTag"]=0; + std::vector 
+  std::vector<int> out_shape;
+  out_shape.push_back(num);
+  out_shape.push_back(num);
+  attrs["out_shape"]=out_shape;
+
+  auto op =
+      f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
+  VLOG(3) << "CreateOp recv_v2";
+
+  op->Run(*scope, place);
+  VLOG(3) << "Run op recv_v2";
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], i*1.0 + atoi(getenv("DEST_RANK")));
+  }
+}
+
+
+TEST(recv_v2, NPU){
+  f::Scope scope;
+  char * npu_id=getenv("FLAGS_selected_npus");
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  Prepare(&scope, ctx);
+  TestHcomRecvOp(&scope, ctx);
+
+}
\ No newline at end of file
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
index 16af626f88a66..ec27871d847b6 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -28,7 +28,6 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
     auto x = ctx.Input<framework::LoDTensor>("X");
-    auto out = ctx.Output<framework::LoDTensor>("Out");
     int numel = x->numel();
     hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
 
@@ -55,15 +54,6 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
     VLOG(3) << "destRank" << destRank << " invoke hcom send. sent "
             << x->numel();
 
-    if (out != x) {
-      framework::TensorCopy(
-          *static_cast<const framework::Tensor*>(x), place,
-          *platform::DeviceContextPool::Instance().Get(place),
-          static_cast<framework::Tensor*>(out));
-    }
-
-    out->Resize(x->dims());
-    out->set_lod(x->lod());
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with GPU."));
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
new file mode 100644
index 0000000000000..9f35bec180232
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+#include <thread>  // NOLINT
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(send_v2);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(send_v2, NPU);
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+  std::string rank_table_file = getenv("RANK_TABLE_FILE");
+  VLOG(3) << "get rank_table_file";
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+  int src_rank = atoi(getenv("SRC_RANK"));
+  int dest_rank = atoi(getenv("DEST_RANK"));
+
+  printf("rank_table_file: %s, rank_id = %d, device_id = %d\n, src_rank = %d\n, dest_rank = %d\n", rank_table_file.c_str(), rank_id, device_id, src_rank, dest_rank);
+  VLOG(3) << "get all envs";
+  std::vector<int> rank_ids = {0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  comm_init_attrs["rank_ids"] = rank_ids;
+  VLOG(3) << "comm_init_attrs map over";
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  VLOG(3) << "CreateOp c_comm_init_hcom";
+  auto place = ctx.GetPlace();
+  VLOG(3) << "get place over";
+
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+}
+
+void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
+  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  int num = atoi(getenv("DATA_SIZE"));
+  EXPECT_GT(num, 0);
+  EXPECT_LT(num, 1 << 15);
+  std::vector<float> init;
+  int rank_id = atoi(getenv("RANK_ID"));
+  std::cout << "rank_id:" << rank_id << std::endl;
+  for (int64_t i = 0; i < num * num; ++i) {
+    init.push_back(1.0 * i + atoi(getenv("DEST_RANK")));
+  }
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num, num});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+
+  ctx.Wait();
+
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("srtest");
+  attrs["peer"]=atoi(getenv("DEST_RANK"));
+  attrs["ring_id"]=0;
+  attrs["srTag"]=0;
+
+  auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
+
+  op->Run(*scope, place);
+  VLOG(3) << "send run over";
+  ctx.Wait();
+
+
+}
+
+TEST(send_v2, NPU){
+  VLOG(3) << "TEST ENTRY";
+  f::Scope scope;
+  VLOG(3) << "scope ";
+  char * npu_id=getenv("FLAGS_selected_npus");
+  VLOG(3) << "select npu";
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  VLOG(3) << "Place over";
+  Prepare(&scope, ctx);
+  TestHcomSendOp(&scope, ctx);
+
+}
\ No newline at end of file

From 6116b12db81b0e695551110a7792c1a351bd1d77 Mon Sep 17 00:00:00 2001
From: f2hkop
Date: Thu, 4 Mar 2021 18:10:21 +0800
Subject: [PATCH 3/3] Ascend Send&Recv Test Pass

---
 .../operators/collective/recv_v2_op_npu.cc    | 12 +++----
 .../collective/recv_v2_op_npu_test.cc         | 10 +++---
 .../operators/collective/send_v2_op_npu.cc    | 10 +++---
 .../collective/send_v2_op_npu_test.cc         | 34 ++++----------
 4 files changed, 22 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
index b0d9d5fd403ed..bf3de9e5ff282 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -47,19 +47,15 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
     int srcRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
     VLOG(3) << "recv_v2_npu attr get";
-    platform::dynload::hcom_receive(
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive(
         tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), numel, dtype, srcRank,
-        srTag, group.c_str(), stream);
-    VLOG(3) << "dtype " << dtype ;
-    VLOG(3) << "numel " << numel ;
-    VLOG(3) << "srTag " << srTag ;
-    VLOG(3) << "srcRank " << srcRank << " invoke hcom receive. receiving "
-            << out->numel();
+        srTag, group.c_str(), stream));
+    VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. receiving " << out->numel();
     out->Resize(out->dims());
     out->set_lod(out->lod());
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
+        "PaddlePaddle should compile with Ascend."));
 #endif
   }
 };
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index d73febedc33f6..5b0c60c72a37c 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -107,17 +107,19 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
   std::vector<float> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
   ctx.Wait();
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], i*1.0 + atoi(getenv("DEST_RANK")));
-  }
+  std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
+  EXPECT_EQ(out_vec == init, true);
 }
 
 
 TEST(recv_v2, NPU){
   f::Scope scope;
   char * npu_id=getenv("FLAGS_selected_npus");
+  VLOG(3) << "Select npu:" << npu_id;
   p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  VLOG(3) << "Place over";
   Prepare(&scope, ctx);
+  VLOG(3) << "Prepare over";
   TestHcomRecvOp(&scope, ctx);
-
+  VLOG(3) << "Test over";
 }
\ No newline at end of file
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
index ec27871d847b6..1275d1b314a57 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -47,16 +47,16 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
     int destRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
 
-    platform::dynload::hcom_send(
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send(
         tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, destRank,
-        srTag, group.c_str(), stream);
-
-    VLOG(3) << "destRank" << destRank << " invoke hcom send. sent "
+        srTag, group.c_str(), stream));
+
+    VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. Sent "
Sent " << x->numel(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); + "PaddlePaddle should compile with Ascend.")); #endif } }; diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 9f35bec180232..2d0a8af0729d5 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -47,15 +47,12 @@ USE_OP_DEVICE_KERNEL(send_v2, NPU); void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ std::string rank_table_file = getenv("RANK_TABLE_FILE"); - VLOG(3) << "get rank_table_file"; - int rank_id = atoi(getenv("RANK_ID")); int device_id = atoi(getenv("DEVICE_ID")); int src_rank = atoi(getenv("SRC_RANK")); int dest_rank = atoi(getenv("DEST_RANK")); + VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; @@ -63,16 +60,11 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; comm_init_attrs["rank_ids"] = rank_ids; - VLOG(3) << "comm_init_attrs map over"; auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - VLOG(3) << "CreateOp c_comm_init_hcom"; auto place = ctx.GetPlace(); - VLOG(3) << "get place over"; - comm_init_op->Run(*scope, place); ctx.Wait(); - } void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ @@ -82,23 +74,13 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ int num = atoi(getenv("DATA_SIZE"));; EXPECT_GT(num, 0); EXPECT_LT(num, 1 << 15); - std::vector init; + std::vector init(num*num, 1.0 * atoi(getenv("DEST_RANK"))); int rank_id = atoi(getenv("RANK_ID")); - std::cout<< "rank_id:" << rank_id<Resize({num, num}); - ctx.Wait(); - auto place = ctx.GetPlace(); - ctx.Wait(); f::AttributeMap attrs; @@ -111,20 +93,18 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ op->Run(*scope, place); VLOG(3)<<"send run over"; - ctx.Wait(); - - + ctx.Wait(); } TEST(send_v2, NPU){ - VLOG(3) << "TEST ENTRY"; f::Scope scope; - VLOG(3) << "scope "; char * npu_id=getenv("FLAGS_selected_npus"); - VLOG(3) << "select npu"; + VLOG(3) << "Select npu:" << npu_id; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); VLOG(3) << "Place over"; Prepare(&scope, ctx); + VLOG(3) << "Prepare over"; TestHcomSendOp(&scope, ctx); + VLOG(3) << "Test over"; } \ No newline at end of file