From 0d96158515bf5299a3a8e7eddffea2bc9e2d55c1 Mon Sep 17 00:00:00 2001
From: f2hkop
Date: Wed, 3 Mar 2021 14:34:10 +0800
Subject: [PATCH 1/3] send_v2_npu.cc recv_v2_npu.cc compiled

---
 .../fluid/operators/collective/recv_v2_op.cc  |  8 ++
 .../operators/collective/recv_v2_op_npu.cc    | 83 +++++++++++++++++++
 .../fluid/operators/collective/send_v2_op.cc  |  8 ++
 .../operators/collective/send_v2_op_npu.cc    | 83 +++++++++++++++++++
 4 files changed, 182 insertions(+)
 create mode 100644 paddle/fluid/operators/collective/recv_v2_op_npu.cc
 create mode 100644 paddle/fluid/operators/collective/send_v2_op_npu.cc

diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc
index 10408820387b7..739924e64492f 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cc
@@ -62,6 +62,14 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
     AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
         .SetDefault(5);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    #pragma message("tag")
+    AddAttr<std::string>("tag", "(string default tag) tag for the send/recv pair.")
+        .SetDefault("tag");
+    #pragma message("srTag")
+    AddAttr<int>("srTag", "(int default 0) tag pairing a send with its recv.")
+        .SetDefault(0);
+#endif
     AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
         .SetDefault(std::vector<int>());
     AddAttr<bool>(
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
new file mode 100644
index 0000000000000..ad2e6e85b4503
--- /dev/null
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/recv_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CRecvOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = x->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    std::string tag = ctx.Attr<std::string>("tag");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    int srcRank = ctx.Attr<int>("peer");
+    int srTag = ctx.Attr<int>("srTag");
+
+    platform::dynload::hcom_receive(
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, srcRank,
+        srTag, group.c_str(), stream);
+
+    VLOG(3) << "srcRank " << srcRank << " invoke hcom receive. receiving "
+            << x->numel();
+
+    if (out != x) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor*>(x), place,
+          *platform::DeviceContextPool::Instance().Get(place),
+          static_cast<framework::Tensor*>(out));
+    }
+
+    out->Resize(x->dims());
+    out->set_lod(x->lod());
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel<int>,
+                       ops::CRecvOpASCENDKernel<int8_t>,
+                       ops::CRecvOpASCENDKernel<float>,
+                       ops::CRecvOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc
index c5a86b4f08813..ca915ebaabc59 100644
--- a/paddle/fluid/operators/collective/send_v2_op.cc
+++ b/paddle/fluid/operators/collective/send_v2_op.cc
@@ -50,6 +50,14 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
         .SetDefault(0);
     AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    #pragma message("tag")
+    AddAttr<std::string>("tag", "(string default tag) tag for the send/recv pair.")
+        .SetDefault("tag");
+    #pragma message("srTag")
+    AddAttr<int>("srTag", "(int default 0) tag pairing a send with its recv.")
+        .SetDefault(0);
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
new file mode 100644
index 0000000000000..16af626f88a66
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CSendOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = x->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+
+    auto place = ctx.GetPlace();
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    std::string tag = ctx.Attr<std::string>("tag");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    int destRank = ctx.Attr<int>("peer");
+    int srTag = ctx.Attr<int>("srTag");
+
+    platform::dynload::hcom_send(
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, destRank,
+        srTag, group.c_str(), stream);
+
+    VLOG(3) << "destRank" << destRank << " invoke hcom send. sent "
+            << x->numel();
+
+    if (out != x) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor*>(x), place,
+          *platform::DeviceContextPool::Instance().Get(place),
+          static_cast<framework::Tensor*>(out));
+    }
+
+    out->Resize(x->dims());
+    out->set_lod(x->lod());
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel<int>,
+                       ops::CSendOpASCENDKernel<int8_t>,
+                       ops::CSendOpASCENDKernel<float>,
+                       ops::CSendOpASCENDKernel<plat::float16>);

From 6912c23aa074eda79fb8d515b411a3be0ef287aa Mon Sep 17 00:00:00 2001
From: f2hkop
Date: Thu, 4 Mar 2021 16:01:56 +0800
Subject: [PATCH 2/3] hcom send&recv test pass, without hcom_destroy

---
 .../fluid/operators/collective/CMakeLists.txt |   2 +
 .../collective/c_reducescatter_op_npu_test.cc |   1 -
 .../operators/collective/recv_v2_op_npu.cc    |  27 ++--
 .../collective/recv_v2_op_npu_test.cc         | 123 +++++++++++++++++
 .../operators/collective/send_v2_op_npu.cc    |  10 --
 .../collective/send_v2_op_npu_test.cc         | 130 ++++++++++++++++++
 6 files changed, 265 insertions(+), 28 deletions(-)
 create mode 100644 paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
 create mode 100644 paddle/fluid/operators/collective/send_v2_op_npu_test.cc

diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index 03f376159f045..e34375756dfeb 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -41,3 +41,5 @@ cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc DEPS op_
 cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_max_op c_allgather_op c_reducescatter_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_allgather_op c_reducescatter_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_allgather_op c_reducescatter_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc DEPS op_registry send_v2_op recv_v2_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS op_registry send_v2_op recv_v2_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
\ No newline at end of file
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index 153bba248e111..2f24f9fad85b6 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -129,7 +129,6 @@ TEST(c_reducescatter, NPU) {
   char * npu_id=getenv("FLAGS_selected_npus");
   p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
-
   Prepare(&scope, ctx);
   TestHCCLReduceScatterOp(&scope, ctx);
 }
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
index ad2e6e85b4503..b0d9d5fd403ed 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -27,10 +27,9 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-    auto x = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
-    int numel = x->numel();
-    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+    int numel = out->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(out->type());
 
     int ring_id = ctx.Attr<int>("ring_id");
     auto place = ctx.GetPlace();
@@ -47,23 +46,17 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     int srcRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
-
+    VLOG(3) << "recv_v2_npu attr get";
     platform::dynload::hcom_receive(
-        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, srcRank,
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), numel, dtype, srcRank,
         srTag, group.c_str(), stream);
-
+    VLOG(3) << "dtype " << dtype ;
+    VLOG(3) << "numel " << numel ;
+    VLOG(3) << "srTag " << srTag ;
     VLOG(3) << "srcRank " << srcRank << " invoke hcom receive. receiving "
receiving " - << x->numel(); - - if (out != x) { - framework::TensorCopy( - *static_cast(x), place, - *platform::DeviceContextPool::Instance().Get(place), - static_cast(out)); - } - - out->Resize(x->dims()); - out->set_lod(x->lod()); + << out->numel(); + out->Resize(out->dims()); + out->set_lod(out->lod()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000..d73febedc33f6 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ + + std::string rank_table_file = getenv("RANK_TABLE_FILE"); + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + int src_rank = atoi(getenv("SRC_RANK")); + int dest_rank = atoi(getenv("DEST_RANK")); + VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0,1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + VLOG(3) << "CreateOp c_comm_init_hcom"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id<Var("Out"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"]=std::string("srtest"); + attrs["peer"]=atoi(getenv("SRC_RANK")); + attrs["ring_id"]=0; + attrs["srTag"]=0; + std::vector 
+  std::vector<int> out_shape;
+  out_shape.push_back(num);
+  out_shape.push_back(num);
+  attrs["out_shape"]=out_shape;
+
+  auto op =
+      f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
+  VLOG(3) << "CreateOp recv_v2";
+
+  op->Run(*scope, place);
+  VLOG(3) << "Run op recv_v2";
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], i*1.0 + atoi(getenv("DEST_RANK")));
+  }
+}
+
+
+TEST(recv_v2, NPU){
+  f::Scope scope;
+  char * npu_id=getenv("FLAGS_selected_npus");
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  Prepare(&scope, ctx);
+  TestHcomRecvOp(&scope, ctx);
+
+}
\ No newline at end of file
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
index 16af626f88a66..ec27871d847b6 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -28,7 +28,6 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
     auto x = ctx.Input<framework::LoDTensor>("X");
-    auto out = ctx.Output<framework::LoDTensor>("Out");
     int numel = x->numel();
     hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
 
@@ -55,15 +54,6 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
     VLOG(3) << "destRank" << destRank << " invoke hcom send. sent "
             << x->numel();
 
-    if (out != x) {
-      framework::TensorCopy(
-          *static_cast<const framework::Tensor*>(x), place,
-          *platform::DeviceContextPool::Instance().Get(place),
-          static_cast<framework::Tensor*>(out));
-    }
-
-    out->Resize(x->dims());
-    out->set_lod(x->lod());
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with GPU."));
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
new file mode 100644
index 0000000000000..9f35bec180232
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+#include <thread>  // NOLINT
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(send_v2);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(send_v2, NPU);
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+  std::string rank_table_file = getenv("RANK_TABLE_FILE");
+  VLOG(3) << "get rank_table_file";
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+  int src_rank = atoi(getenv("SRC_RANK"));
+  int dest_rank = atoi(getenv("DEST_RANK"));
+
+  printf("rank_table_file: %s, rank_id = %d, device_id = %d\n, src_rank = %d\n, dest_rank = %d\n", rank_table_file.c_str(), rank_id, device_id, src_rank, dest_rank);
+  VLOG(3) << "get all envs";
+  std::vector<int> rank_ids = {0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  comm_init_attrs["rank_ids"] = rank_ids;
+  VLOG(3) << "comm_init_attrs map over";
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  VLOG(3) << "CreateOp c_comm_init_hcom";
+  auto place = ctx.GetPlace();
+  VLOG(3) << "get place over";
+
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+}
+
+void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
+  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  int num = atoi(getenv("DATA_SIZE"));
+  EXPECT_GT(num, 0);
+  EXPECT_LT(num, 1 << 15);
+  std::vector<float> init;
+  int rank_id = atoi(getenv("RANK_ID"));
+  std::cout << "rank_id:" << rank_id << std::endl;
+  for (int64_t i = 0; i < num * num; ++i) {
+    init.push_back(1.0 * i + atoi(getenv("DEST_RANK")));
+  }
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num, num});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+
+  ctx.Wait();
+
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("srtest");
+  attrs["peer"]=atoi(getenv("DEST_RANK"));
+  attrs["ring_id"]=0;
+  attrs["srTag"]=0;
+
+  auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
+
+  op->Run(*scope, place);
+  VLOG(3) << "send run over";
+  ctx.Wait();
+
+
+}
+
+TEST(send_v2, NPU){
+  VLOG(3) << "TEST ENTRY";
+  f::Scope scope;
+  VLOG(3) << "scope ";
+  char * npu_id=getenv("FLAGS_selected_npus");
+  VLOG(3) << "select npu";
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  VLOG(3) << "Place over";
+  Prepare(&scope, ctx);
+  TestHcomSendOp(&scope, ctx);
+
+}
\ No newline at end of file

From 6116b12db81b0e695551110a7792c1a351bd1d77 Mon Sep 17 00:00:00 2001
From: f2hkop
Date: Thu, 4 Mar 2021 18:10:21 +0800
Subject: [PATCH 3/3] Ascend Send&Recv Test Pass

---
 .../operators/collective/recv_v2_op_npu.cc    | 12 +++----
 .../collective/recv_v2_op_npu_test.cc         | 10 +++---
 .../operators/collective/send_v2_op_npu.cc    | 10 +++---
 .../collective/send_v2_op_npu_test.cc         | 34 ++++----------
 4 files changed, 22 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
index b0d9d5fd403ed..bf3de9e5ff282 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -47,19 +47,15 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
     int srcRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
     VLOG(3) << "recv_v2_npu attr get";
-    platform::dynload::hcom_receive(
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive(
         tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), numel, dtype, srcRank,
-        srTag, group.c_str(), stream);
-    VLOG(3) << "dtype " << dtype ;
-    VLOG(3) << "numel " << numel ;
-    VLOG(3) << "srTag " << srTag ;
-    VLOG(3) << "srcRank " << srcRank << " invoke hcom receive. receiving "
-            << out->numel();
+        srTag, group.c_str(), stream));
+    VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. receiving " << out->numel();
     out->Resize(out->dims());
     out->set_lod(out->lod());
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
+        "PaddlePaddle should compile with Ascend."));
 #endif
   }
 };
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index d73febedc33f6..5b0c60c72a37c 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -107,17 +107,19 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
   std::vector<float> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
   ctx.Wait();
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], i*1.0 + atoi(getenv("DEST_RANK")));
-  }
+  std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
+  EXPECT_EQ(out_vec == init, true);
 }
 
 
 TEST(recv_v2, NPU){
   f::Scope scope;
   char * npu_id=getenv("FLAGS_selected_npus");
+  VLOG(3) << "Select npu:" << npu_id;
   p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  VLOG(3) << "Place over";
   Prepare(&scope, ctx);
+  VLOG(3) << "Prepare over";
   TestHcomRecvOp(&scope, ctx);
-
+  VLOG(3) << "Test over";
 }
\ No newline at end of file
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
index ec27871d847b6..1275d1b314a57 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -47,16 +47,16 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
     int destRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
 
-    platform::dynload::hcom_send(
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send(
         tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype, destRank,
-        srTag, group.c_str(), stream);
-
-    VLOG(3) << "destRank" << destRank << " invoke hcom send. sent "
+        srTag, group.c_str(), stream));
+
+    VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. Sent "
Sent " << x->numel(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); + "PaddlePaddle should compile with Ascend.")); #endif } }; diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 9f35bec180232..2d0a8af0729d5 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -47,15 +47,12 @@ USE_OP_DEVICE_KERNEL(send_v2, NPU); void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ std::string rank_table_file = getenv("RANK_TABLE_FILE"); - VLOG(3) << "get rank_table_file"; - int rank_id = atoi(getenv("RANK_ID")); int device_id = atoi(getenv("DEVICE_ID")); int src_rank = atoi(getenv("SRC_RANK")); int dest_rank = atoi(getenv("DEST_RANK")); + VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; @@ -63,16 +60,11 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; comm_init_attrs["rank_ids"] = rank_ids; - VLOG(3) << "comm_init_attrs map over"; auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - VLOG(3) << "CreateOp c_comm_init_hcom"; auto place = ctx.GetPlace(); - VLOG(3) << "get place over"; - comm_init_op->Run(*scope, place); ctx.Wait(); - } void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ @@ -82,23 +74,13 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ int num = atoi(getenv("DATA_SIZE"));; EXPECT_GT(num, 0); EXPECT_LT(num, 1 << 15); - std::vector init; + std::vector init(num*num, 1.0 * atoi(getenv("DEST_RANK"))); int rank_id = atoi(getenv("RANK_ID")); - std::cout<< "rank_id:" << rank_id<Resize({num, num}); - ctx.Wait(); - auto place = ctx.GetPlace(); - ctx.Wait(); f::AttributeMap attrs; @@ -111,20 +93,18 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ op->Run(*scope, place); VLOG(3)<<"send run over"; - ctx.Wait(); - - + ctx.Wait(); } TEST(send_v2, NPU){ - VLOG(3) << "TEST ENTRY"; f::Scope scope; - VLOG(3) << "scope "; char * npu_id=getenv("FLAGS_selected_npus"); - VLOG(3) << "select npu"; + VLOG(3) << "Select npu:" << npu_id; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); VLOG(3) << "Place over"; Prepare(&scope, ctx); + VLOG(3) << "Prepare over"; TestHcomSendOp(&scope, ctx); + VLOG(3) << "Test over"; } \ No newline at end of file