
Commit

update, test=develop
sandyhouse committed Apr 22, 2021
1 parent c1ae39f commit efe37a9
Showing 1 changed file with 148 additions and 1 deletion.
149 changes: 148 additions & 1 deletion paddle/fluid/operators/collective/c_allreduce_op.h
@@ -19,17 +19,31 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/collective_helper.h"
#endif

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif

#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif

#if defined(PADDLE_WITH_GLOO)
#include <gloo/allreduce.h>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/hccl_helper.h"
#endif

namespace paddle {
namespace operators {

@@ -105,6 +119,135 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
}
};

template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out");
auto place = ctx.GetPlace();
HcclDataType dtype = platform::ToHCCLDataType(in->type());
int64_t numel = in->numel();

void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
// Shape and allocate the output before handing its buffer to HCCL
// (mirrors the XPU kernel below).
out->Resize(in->dims());
void* recvbuff = reinterpret_cast<void*>(out->mutable_data<T>(place));

int ring_id = ctx.Attr<int>("ring_id");
std::string group =
std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto comm =
paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
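// Use the op's computation stream when use_calc_stream is set (so the
// collective is ordered with surrounding kernels); otherwise use the
// communicator's dedicated stream.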
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}

HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
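// Map the framework-level reduce type onto the matching HCCL reduction op.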
switch (red_type) {
case kRedSum:
hccl_red_type = HCCL_REDUCE_SUM;
break;

case kRedMax:
hccl_red_type = HCCL_REDUCE_MAX;
break;

case kRedMin:
hccl_red_type = HCCL_REDUCE_MIN;
break;

case kRedProd:
hccl_red_type = HCCL_REDUCE_PROD;
break;

default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}

VLOG(3) << "begin hccl allreduce, parameter is: "
<< "input num: " << numel << ", dtype: " << dtype
<< ", hccl_red_type: " << hccl_red_type << ", group is: " << group;

PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
reinterpret_cast<void*>(stream)));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU."));
#endif
}
};
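// A sketch (not part of this commit) of how such a kernel is typically bound
// to a concrete op: Paddle registers device kernels in per-device .cc files,
// e.g. a hypothetical c_allreduce_sum_op_npu.cc, assuming the
// REGISTER_OP_NPU_KERNEL macro. Names below are illustrative assumptions:
//
//   namespace ops = paddle::operators;
//   namespace plat = paddle::platform;
//   REGISTER_OP_NPU_KERNEL(
//       c_allreduce_sum,
//       ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
//       ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>);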

template <ReduceType red_type, typename T>
class CAllReduceOpXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_XPU_BKCL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");

auto place = ctx.GetPlace();
BKCLDataType dtype = platform::ToBKCLDataType(in->type());
int64_t numel = in->numel();
const void* sendbuff = in->data<void>();
out->Resize(in->dims());
void* recvbuff = out->mutable_data<T>(place);

int rid = ctx.Attr<int>("ring_id");
auto comm = platform::BKCLCommContext::Instance().Get(rid, place);

XPUStream stream = nullptr;
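// As in the NPU kernel, run on the computation stream when use_calc_stream
// is set, otherwise on the communicator's dedicated stream.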
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
->x_context()
->xpu_stream;
} else {
stream = comm->stream();
}

BKCLOp bkcl_red_type = BKCL_ADD;
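// Map the framework-level reduce type onto the matching BKCL reduction op.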
switch (red_type) {
case kRedSum:
bkcl_red_type = BKCL_ADD;
break;

case kRedMax:
bkcl_red_type = BKCL_MAX;
break;

case kRedMin:
bkcl_red_type = BKCL_MIN;
break;

case kRedProd:
bkcl_red_type = BKCL_PRODUCT;
break;

default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid reduce type: %d", red_type));
}

PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel,
dtype, bkcl_red_type, stream),
BKCL_SUCCESS, platform::errors::PreconditionNotMet(
"BKCL all reduce failed"));
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should be compiled with XPU."));
#endif
}
};
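// A sketch (not part of this commit) of the analogous XPU registration,
// assuming a hypothetical c_allreduce_sum_op_xpu.cc and Paddle's
// REGISTER_OP_XPU_KERNEL macro:
//
//   namespace ops = paddle::operators;
//   REGISTER_OP_XPU_KERNEL(
//       c_allreduce_sum,
//       ops::CAllReduceOpXPUKernel<ops::kRedSum, float>);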

template <ReduceType red_type, typename T>
class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
public:
@@ -170,6 +313,10 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out", "(Tensor) the allreduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
.SetDefault("tag");
#endif
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")

1 comment on commit efe37a9

@paddle-bot-old

Congratulations! Your pull request passed all required CI. You can ask reviewer(s) to approve and merge. 🎉
