[NPU] support global accumulator for adam #32780

Merged: 10 commits merged on May 13, 2021
21 changes: 20 additions & 1 deletion paddle/fluid/operators/optimizers/adam_op.cc
@@ -198,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);

AddComment(R"DOC(
Adam Optimizer.
@@ -246,4 +253,16 @@ REGISTER_OP_VERSION(adam)
"EpsilonTensor",
"If provided, Adam will use this as epsilon, "
"this has a higher priority than attr(epsilon). "
"For better performance in npu kernel. "));
"For better performance in npu kernel. "))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 attribute [use_global_beta_pow].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_global_beta_pow",
"If true, Adam will use global beta_pow for whole model "
"instead of creating beta_pow for each parameter."
"In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be "
"used in adam op, "
"and beta_pow will be updated after all adam op in the model.",
false));
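
The attribute added above changes how the bias-correction accumulators beta1^t and beta2^t are bookkept: with use_global_beta_pow=false (the default) every parameter's adam op owns its own Beta1Pow/Beta2Pow and multiplies them by beta1/beta2 each step, while with the flag set a single shared pair is advanced once per step after all adam ops have run. A minimal standalone C++ sketch of the two schemes (names and loop structure are illustrative, not Paddle API):

#include <cassert>
#include <cmath>
#include <vector>

// Per-parameter scheme: each parameter owns a beta1^t accumulator and scales
// it inside its own adam update (use_global_beta_pow == false).
// Global scheme: one accumulator is shared by every parameter and scaled
// exactly once per step, after all adam updates have run.
int main() {
  const float beta1 = 0.9f;
  const int num_params = 4;
  const int num_steps = 10;

  std::vector<float> per_param_pow(num_params, 1.0f);  // one beta1^t per param
  float global_pow = 1.0f;                             // single shared beta1^t

  for (int step = 0; step < num_steps; ++step) {
    for (int p = 0; p < num_params; ++p) {
      // ... the adam update for parameter p reads the accumulator here ...
      per_param_pow[p] *= beta1;  // per-parameter update
    }
    global_pow *= beta1;  // one shared update per step
  }

  // Both schemes end up holding beta1^num_steps; the global one just avoids
  // num_params redundant scalar updates (and accumulator tensors) per step.
  for (int p = 0; p < num_params; ++p) {
    assert(std::fabs(per_param_pow[p] - global_pow) < 1e-6f);
  }
  assert(std::fabs(global_pow -
                   static_cast<float>(std::pow(beta1, num_steps))) < 1e-6f);
  return 0;
}
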
55 changes: 32 additions & 23 deletions paddle/fluid/operators/optimizers/adam_op.cu
@@ -154,6 +154,8 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
@@ -254,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
if (!use_global_beta_pow) {
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
AdamKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
@@ -269,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
if (!use_global_beta_pow) {
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}

} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
@@ -328,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode, ndim);
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
if (!use_global_beta_pow) {
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
SparseAdamFunctor<T, GPUAdam, MPDType> functor(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
@@ -351,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
ctx.device_context()),
param->numel());
for_range(functor);
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
if (!use_global_beta_pow) {
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
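
In the CUDA hunks above, both the host-side scalar multiply (when beta_pow lives on the CPU) and the on-device UpdateBetaPow launch (<<<1, 32, 0, dev_ctx.stream()>>>) are now skipped when use_global_beta_pow is true. The kernel body itself is not part of this diff; under the assumption that it only scales two 1-element accumulators, a tiny scalar-update kernel launched that way could look like the following sketch (illustration only, not the actual Paddle UpdateBetaPow):

// Sketch only: assumes the kernel merely scales two 1-element accumulators.
template <typename T>
__global__ void UpdateBetaPowSketch(T beta1, T beta2,
                                    const T* beta1_pow, const T* beta2_pow,
                                    T* beta1_pow_out, T* beta2_pow_out) {
  // Only one thread has useful work; launching a single warp (32 threads)
  // is a common convention for scalar helper kernels.
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    beta1_pow_out[0] = beta1 * beta1_pow[0];
    beta2_pow_out[0] = beta2 * beta2_pow[0];
  }
}

Doing the multiply on the device avoids copying the accumulator back to the host when it already lives in GPU memory, which is why the kernel path and the CPU path are kept separate above.
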
23 changes: 14 additions & 9 deletions paddle/fluid/operators/optimizers/adam_op.h
@@ -406,6 +406,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
@@ -475,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
lr->data<T>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()));
functor(param->numel());
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];

if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
@@ -523,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size(), lazy_mode);
// update beta1 and beta2
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
}
if (lazy_mode) {
VLOG(3) << "run cpu lazy mode";
size_t row_count = grad_merge.rows().size();
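
For context on why each adam op needs the current beta1^t and beta2^t at all: they feed the bias-corrected step size. A short sketch of a single Adam parameter update in the textbook formulation follows; the Paddle functors differ in details such as how epsilon is scaled, so treat this as illustrative only:

#include <cmath>

// Textbook Adam update for one scalar parameter; beta1_pow == beta1^t and
// beta2_pow == beta2^t after t steps.
float AdamParamUpdate(float param, float grad, float* moment1, float* moment2,
                      float lr, float beta1, float beta2, float epsilon,
                      float beta1_pow, float beta2_pow) {
  *moment1 = beta1 * (*moment1) + (1.0f - beta1) * grad;
  *moment2 = beta2 * (*moment2) + (1.0f - beta2) * grad * grad;
  // Bias-corrected step size: this is why every adam op reads the current
  // beta1^t / beta2^t, whether it owns them or shares one global copy.
  const float lr_t = lr * std::sqrt(1.0f - beta2_pow) / (1.0f - beta1_pow);
  return param - lr_t * (*moment1) / (std::sqrt(*moment2) + epsilon);
}

When use_global_beta_pow is true, the CPU kernel above still reads Beta1Pow/Beta2Pow as inputs in exactly this way; it simply no longer writes the advanced values into Beta1PowOut/Beta2PowOut.
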
44 changes: 25 additions & 19 deletions paddle/fluid/operators/optimizers/adam_op_npu.cc
@@ -36,7 +36,6 @@ class AdamNPUKernel : public framework::OpKernel<T> {
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
@@ -50,34 +49,37 @@ class AdamNPUKernel : public framework::OpKernel<T> {
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");

auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* beta1_pow = ctx.Input<Tensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<Tensor>("Beta2Pow");

auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");

bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

param_out->mutable_data<T>(ctx.GetPlace());
mom1_out->mutable_data<T>(ctx.GetPlace());
mom2_out->mutable_data<T>(ctx.GetPlace());

// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place.
// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
// place.
LoDTensor beta1_pow_tmp;
LoDTensor beta2_pow_tmp;
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
// `mutable_data` operation needs to be done after getting data
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
} else {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
beta1_pow = &beta1_pow_tmp;
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
} else {
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
beta2_pow = &beta2_pow_tmp;
}

const Tensor* beta1_tensor = nullptr;
@@ -174,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel<T> {
*mom2, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom2_out);
}
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
}
}
};

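
The notable change in the NPU kernel above is that Beta1PowOut/Beta2PowOut are no longer allocated up front: their mutable_data calls move next to the on-device Mul runners inside the !use_global_beta_pow branch, so nothing is allocated or launched for the accumulators when the model keeps a global copy. A toy C++ sketch of that gating pattern, with stand-in types for the NPU tensor and the Mul runner (assumed helpers for illustration, not Paddle API):

#include <memory>

// DeviceScalar pretends to be a 1-element NPU tensor; Mul pretends to be
// NpuOpRunner("Mul", ...). Both are stand-ins for illustration only.
struct DeviceScalar {
  std::unique_ptr<float> data;
  void Allocate() { data = std::make_unique<float>(0.0f); }  // ~ mutable_data<T>()
};

void Mul(const DeviceScalar& in, float scale, DeviceScalar* out) {
  *out->data = *in.data * scale;  // pretend on-device multiply
}

void MaybeAdvanceBetaPow(bool use_global_beta_pow,
                         const DeviceScalar& beta1_pow, float beta1,
                         DeviceScalar* beta1_pow_out) {
  // Parameter and moment updates are unaffected by the flag and run before this.
  if (!use_global_beta_pow) {
    beta1_pow_out->Allocate();             // only allocate the output when it is used
    Mul(beta1_pow, beta1, beta1_pow_out);  // only scale the accumulator the op owns
  }
  // Otherwise Beta1PowOut is left untouched; a single global accumulator is
  // advanced elsewhere, once per step for the whole model.
}
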
82 changes: 44 additions & 38 deletions paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -73,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
"value is:%d.",
beta2_pow_out->numel()));

bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
@@ -111,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
param_out.template mutable_data<T>(ctx.GetPlace()), param.numel());

// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const T* beta1_pow_p = beta1_pow.template data<T>();
beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const T* beta2_pow_p = beta2_pow.template data<T>();
beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
} else {
T cpu_beta1_pow_out_data;
T cpu_beta2_pow_out_data;
memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
beta1_pow_ptr, sizeof(T));

cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
beta2_pow_ptr, sizeof(T));

cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;

T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta1_pow_out_p, platform::CPUPlace(),
&cpu_beta1_pow_out_data, sizeof(T));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta2_pow_out_p, platform::CPUPlace(),
&cpu_beta2_pow_out_data, sizeof(T));
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const T* beta1_pow_p = beta1_pow.template data<T>();
beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const T* beta2_pow_p = beta2_pow.template data<T>();
beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];

} else {
T cpu_beta1_pow_out_data;
T cpu_beta2_pow_out_data;

memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
beta1_pow_ptr, sizeof(T));

cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
beta2_pow_ptr, sizeof(T));

cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;

T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta1_pow_out_p, platform::CPUPlace(),
&cpu_beta1_pow_out_data, sizeof(T));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta2_pow_out_p, platform::CPUPlace(),
&cpu_beta2_pow_out_data, sizeof(T));
}

PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
}

PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
} else {
PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
"Variable type not supported by adam_op"));
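
The XPU kernel keeps the accumulator update on the host: when Beta1Pow/Beta2Pow live on the CPU the multiply happens directly in host memory, and when they live on the XPU the 1-element values are copied to the host, scaled, and copied back, with the whole sequence now guarded by !use_global_beta_pow. A minimal sketch of that scalar round trip, with plain memcpy standing in for memory::Copy between XPUPlace and CPUPlace (illustration only, not the XPU runtime API):

#include <cstring>

// device_beta_pow / device_beta_pow_out stand for 1-element XPU tensors;
// std::memcpy stands in for memory::Copy(CPUPlace <-> XPUPlace).
void UpdateBetaPowViaHost(const float* device_beta_pow, float beta,
                          float* device_beta_pow_out) {
  float host_value = 0.0f;
  std::memcpy(&host_value, device_beta_pow, sizeof(float));      // device -> host
  host_value *= beta;                                            // scale on the host
  std::memcpy(device_beta_pow_out, &host_value, sizeof(float));  // host -> device
}

For a single float per accumulator the two extra copies are cheap, and with use_global_beta_pow set they are skipped entirely, removing the last per-parameter traffic for the accumulators on this path.
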