diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
index 89ea35ca9e7f9..d491593d0af11 100644
--- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/optimizers/dgc_momentum_op.h"
-
 #include <string>
 
+#include "paddle/fluid/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {
 
@@ -203,6 +203,3 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(dgc_momentum,
                              ops::DGCMomentumOp,
                              ops::DGCMomentumOpMaker);
-
-PD_REGISTER_STRUCT_KERNEL(
-    dgc_momentum, CPU, ALL_LAYOUT, ops::DGCMomentumKernel, float) {}
diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h
deleted file mode 100644
index 18d0a78e4bd10..0000000000000
--- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h
+++ /dev/null
@@ -1,206 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/momentum_kernel.h"
-#include "paddle/phi/kernels/sgd_kernel.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename DeviceContext>
-class DGCMomentumKernel : public framework::OpKernel<T> {
- public:
-  DGCMomentumKernel() {}
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
-    if (static_cast<int>(rampup_begin_step) < 0) {
-      return;
-    }
-
-    auto current_step_tensor = context.Input<phi::DenseTensor>("current_step");
-    auto* current_step = current_step_tensor->data<T>();
-
-    // nranks
-    auto nranks_tensor = context.Input<phi::DenseTensor>("nranks");
-    const int nranks = static_cast<int>(*nranks_tensor->data<float>());
-    PADDLE_ENFORCE_GT(
-        nranks,
-        1,
-        platform::errors::InvalidArgument(
-            "DGC is not useful when num_trainers <= 1, but now nranks=%d",
-            nranks));
-
-    const phi::DenseTensor* g = context.Input<phi::DenseTensor>("Grad");
-    phi::DenseTensor* g_out = context.Output<phi::DenseTensor>("Grad_out");
-    auto g_e = framework::EigenVector<T>::Flatten(*g);
-    auto g_out_e = framework::EigenVector<T>::Flatten(*g_out);
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto& eigen_ctx = *dev_ctx.eigen_device();
-
-    // NOTE. In dgc_op we multi grad with nranks, so we need /nranks here.
-    g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e;
-
-    VLOG(10) << "current_step:" << *current_step
-             << ", rampup_begin_step:" << rampup_begin_step;
-
-    const auto* grad_var = context.InputVar("Grad");
-    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
-      VLOG(10) << " so use momentum optimizer";
-      auto* learning_rate = context.Input<phi::DenseTensor>("LearningRate");
-      bool multi_precision = context.Attr<bool>("multi_precision");
-
-      auto* param = context.Input<phi::DenseTensor>("Param");
-      auto* velocity = context.Input<phi::DenseTensor>("Velocity");
-      auto* param_out = context.Output<phi::DenseTensor>("ParamOut");
-      auto* velocity_out = context.Output<phi::DenseTensor>("VelocityOut");
-      auto* master_param_out =
-          context.Output<phi::DenseTensor>("MasterParamOut");
-      paddle::optional<phi::DenseTensor> master_param_opt(paddle::none);
-      float mu = context.Attr<float>("mu");
-      bool use_nesterov = context.Attr<bool>("use_nesterov");
-      std::string regularization_method =
-          context.Attr<std::string>("regularization_method");
-      float regularization_coeff = context.Attr<float>("regularization_coeff");
-      float rescale_grad = context.Attr<float>("rescale_grad");
-
-      if (grad_var->IsType<phi::DenseTensor>()) {
-        // sgd_dense
-        auto* grad = context.Input<phi::DenseTensor>("Grad");
-        phi::MomentumDenseKernel<T>(
-            static_cast<const typename framework::ConvertToPhiContext<
-                DeviceContext>::TYPE&>(dev_ctx),
-            *param,
-            *grad,
-            *velocity,
-            *learning_rate,
-            master_param_opt,
-            mu,
-            use_nesterov,
-            regularization_method,
-            regularization_coeff,
-            multi_precision,
-            rescale_grad,
-            param_out,
-            velocity_out,
-            master_param_out);
-      } else {
-        // sgd dense param sparse grad
-        auto* grad = context.Input<phi::SelectedRows>("Grad");
-        phi::MomentumSparseKernel<T>(
-            static_cast<const typename framework::ConvertToPhiContext<
-                DeviceContext>::TYPE&>(dev_ctx),
-            *param,
-            *grad,
-            *velocity,
-            *learning_rate,
-            master_param_opt,
-            mu,
-            use_nesterov,
-            regularization_method,
-            regularization_coeff,
-            multi_precision,
-            rescale_grad,
-            param_out,
-            velocity_out,
-            master_param_out);
-      }
-
-      return;
-    }
-
-    VLOG(10) << " so use sgd optimizer";
-
-    const auto* param_var = context.InputVar("Param");
-
-    auto* learning_rate = context.Input<phi::DenseTensor>("LearningRate");
-    bool multi_precision = context.Attr<bool>("multi_precision");
-    if (param_var->IsType<phi::DenseTensor>()) {
-      auto* param = context.Input<phi::DenseTensor>("Param");
-      auto* param_out = context.Output<phi::DenseTensor>("ParamOut");
-      auto* master_param_out =
-          context.Output<phi::DenseTensor>("MasterParamOut");
-      paddle::optional<phi::DenseTensor> master_param_opt(paddle::none);
-      if (multi_precision) {
-        auto* master_param = context.Input<phi::DenseTensor>("MasterParam");
-        master_param_opt = *master_param;
-      }
-
-      if (grad_var->IsType<phi::DenseTensor>()) {
-        // sgd_dense
-        auto* grad = context.Input<phi::DenseTensor>("Grad");
-        phi::SGDDenseKernel<T>(
-            static_cast<const typename framework::ConvertToPhiContext<
-                DeviceContext>::TYPE&>(dev_ctx),
-            *param,
-            *learning_rate,
-            *grad,
-            master_param_opt,
-            multi_precision,
-            param_out,
-            master_param_out);
-      } else {
-        // sgd dense param sparse grad
-        auto* grad = context.Input<phi::SelectedRows>("Grad");
-        phi::SGDDenseParamSparseGradKernel<T>(
-            static_cast<const typename framework::ConvertToPhiContext<
-                DeviceContext>::TYPE&>(dev_ctx),
-            *param,
-            *learning_rate,
-            *grad,
-            master_param_opt,
-            multi_precision,
-            param_out,
-            master_param_out);
-      }
-    } else if (param_var->IsType<phi::SelectedRows>() &&
-               grad_var->IsType<phi::SelectedRows>() &&
-               platform::is_cpu_place(context.GetPlace())) {
-      // sgd sparse param sparse grad
-      auto* param = context.Input<phi::SelectedRows>("Param");
-      auto* param_out = context.Output<phi::SelectedRows>("ParamOut");
-      auto* master_param_out =
-          context.Output<phi::SelectedRows>("MasterParamOut");
-      paddle::optional<phi::SelectedRows> master_param_opt(paddle::none);
-      if (multi_precision) {
-        auto* master_param = context.Input<phi::SelectedRows>("MasterParam");
-        master_param_opt = *master_param;
-      }
-      auto* grad = context.Input<phi::SelectedRows>("Grad");
-      phi::SGDSparseParamSparseGradKernel<T>(
-          static_cast<const typename framework::ConvertToPhiContext<
-              DeviceContext>::TYPE&>(dev_ctx),
-          *param,
-          *learning_rate,
-          *grad,
-          master_param_opt,
-          multi_precision,
-          param_out,
-          master_param_out);
-
-    } else {
-      PADDLE_THROW("gdc not support yet");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu b/paddle/phi/kernels/cpu/dgc_momentum_kernel.cc
similarity index 61%
rename from paddle/fluid/operators/optimizers/dgc_momentum_op.cu
rename to paddle/phi/kernels/cpu/dgc_momentum_kernel.cc
index a36db8621c1ac..39b5f149426d8 100644
--- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu
+++ b/paddle/phi/kernels/cpu/dgc_momentum_kernel.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,9 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/optimizers/dgc_momentum_op.h"
+#include "paddle/phi/kernels/dgc_momentum_kernel.h"
 
-namespace ops = paddle::operators;
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/dgc_momentum_kernel_impl.h"
 
-PD_REGISTER_STRUCT_KERNEL(
-    dgc_momentum, GPU, ALL_LAYOUT, ops::DGCMomentumKernel, float) {}
+PD_REGISTER_KERNEL(
+    dgc_momentum, CPU, ALL_LAYOUT, phi::DGCMomentumKernel, float) {}
diff --git a/paddle/phi/kernels/dgc_momentum_kernel.h b/paddle/phi/kernels/dgc_momentum_kernel.h
new file mode 100644
index 0000000000000..7e5b549a8ce81
--- /dev/null
+++ b/paddle/phi/kernels/dgc_momentum_kernel.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DGCMomentumKernel(const Context& dev_ctx,
+                       const DenseTensor& param,
+                       const DenseTensor& grad,
+                       const DenseTensor& velocity,
+                       const DenseTensor& learning_rate,
+                       const DenseTensor& master_param,
+                       const DenseTensor& current_step_tensor,
+                       const DenseTensor& nranks_tensor,
+                       float mu,
+                       bool use_nesterov,
+                       const std::string& regularization_method,
+                       float regularization_coeff,
+                       bool multi_precision,
+                       float rescale_grad,
+                       float rampup_begin_step,
+                       DenseTensor* param_out,
+                       DenseTensor* velocity_out,
+                       DenseTensor* master_param_out,
+                       DenseTensor* grad_out);
+
+}  // namespace phi
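The header above gives dgc_momentum an explicit functional signature in place of the deleted fluid kernel's string-keyed ExecutionContext lookups. For orientation, a direct call would look roughly like the sketch below; the wrapper and its attribute values are illustrative assumptions, and in the framework itself dispatch goes through the registered kernel and argument mapping rather than a direct call.

    #include "paddle/phi/backends/cpu/cpu_context.h"
    #include "paddle/phi/kernels/dgc_momentum_kernel.h"

    // Hypothetical wrapper, not part of this patch: forwards pre-built
    // tensors to the kernel declared above with typical attribute values.
    void RunDGCMomentumStep(const phi::CPUContext& dev_ctx,
                            const phi::DenseTensor& param,
                            const phi::DenseTensor& grad,
                            const phi::DenseTensor& velocity,
                            const phi::DenseTensor& learning_rate,
                            const phi::DenseTensor& master_param,
                            const phi::DenseTensor& current_step,
                            const phi::DenseTensor& nranks,
                            phi::DenseTensor* param_out,
                            phi::DenseTensor* velocity_out,
                            phi::DenseTensor* master_param_out,
                            phi::DenseTensor* grad_out) {
      phi::DGCMomentumKernel<float, phi::CPUContext>(
          dev_ctx, param, grad, velocity, learning_rate, master_param,
          current_step, nranks,
          /*mu=*/0.9f,
          /*use_nesterov=*/false,
          /*regularization_method=*/"",
          /*regularization_coeff=*/0.0f,
          /*multi_precision=*/false,
          /*rescale_grad=*/1.0f,
          /*rampup_begin_step=*/0.0f,
          param_out, velocity_out, master_param_out, grad_out);
    }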
diff --git a/paddle/phi/kernels/gpu/dgc_momentum_kernel.cu b/paddle/phi/kernels/gpu/dgc_momentum_kernel.cu
new file mode 100644
index 0000000000000..909a4bb0c59fc
--- /dev/null
+++ b/paddle/phi/kernels/gpu/dgc_momentum_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/dgc_momentum_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/dgc_momentum_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    dgc_momentum, GPU, ALL_LAYOUT, phi::DGCMomentumKernel, float) {}
diff --git a/paddle/phi/kernels/impl/dgc_momentum_kernel_impl.h b/paddle/phi/kernels/impl/dgc_momentum_kernel_impl.h
new file mode 100644
index 0000000000000..c79f3cd2dcc67
--- /dev/null
+++ b/paddle/phi/kernels/impl/dgc_momentum_kernel_impl.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "glog/logging.h"
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+#include "paddle/phi/kernels/momentum_kernel.h"
+#include "paddle/phi/kernels/sgd_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void DGCMomentumKernel(const Context& dev_ctx,
+                       const DenseTensor& param,
+                       const DenseTensor& grad,
+                       const DenseTensor& velocity,
+                       const DenseTensor& learning_rate,
+                       const DenseTensor& master_param,
+                       const DenseTensor& current_step_tensor,
+                       const DenseTensor& nranks_tensor,
+                       float mu,
+                       bool use_nesterov,
+                       const std::string& regularization_method,
+                       float regularization_coeff,
+                       bool multi_precision,
+                       float rescale_grad,
+                       float rampup_begin_step,
+                       DenseTensor* param_out,
+                       DenseTensor* velocity_out,
+                       DenseTensor* master_param_out,
+                       DenseTensor* grad_out) {
+  if (static_cast<int>(rampup_begin_step) < 0) {
+    return;
+  }
+
+  auto* current_step = current_step_tensor.data<T>();
+
+  // nranks
+  const int nranks = static_cast<int>(*nranks_tensor.data<float>());
+  PADDLE_ENFORCE_GT(
+      nranks,
+      1,
+      phi::errors::InvalidArgument(
+          "DGC is not useful when num_trainers <= 1, but now nranks=%d",
+          nranks));
+
+  auto grad_e = phi::EigenVector<T>::Flatten(grad);
+  auto grad_out_e = phi::EigenVector<T>::Flatten(*grad_out);
+
+  auto& eigen_ctx = *dev_ctx.eigen_device();
+
+  // NOTE: dgc_op multiplies grad by nranks, so we divide by nranks here.
+  grad_out_e.device(eigen_ctx) = (1.0 / nranks) * grad_e;
+
+  VLOG(10) << "current_step:" << *current_step
+           << ", rampup_begin_step:" << rampup_begin_step;
+
+  if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
+    VLOG(10) << " so use momentum optimizer";
+
+    paddle::optional<DenseTensor> master_param_opt(paddle::none);
+
+    phi::MomentumDenseKernel<T>(dev_ctx,
+                                param,
+                                grad,
+                                velocity,
+                                learning_rate,
+                                master_param_opt,
+                                mu,
+                                use_nesterov,
+                                regularization_method,
+                                regularization_coeff,
+                                multi_precision,
+                                rescale_grad,
+                                param_out,
+                                velocity_out,
+                                master_param_out);
+
+    return;
+  }
+
+  VLOG(10) << " so use sgd optimizer";
+
+  paddle::optional<DenseTensor> master_param_opt(paddle::none);
+  if (multi_precision) {
+    master_param_opt = master_param;
+  }
+
+  phi::SGDDenseKernel<T>(dev_ctx,
+                         param,
+                         learning_rate,
+                         grad,
+                         master_param_opt,
+                         multi_precision,
+                         param_out,
+                         master_param_out);
+}
+
+}  // namespace phi
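The impl header above makes two decisions worth spelling out: it first rescales the gradient by 1/nranks (undoing the multiplication dgc_op performed), then takes the momentum branch before rampup_begin_step and plain SGD afterwards, since an active dgc_op applies the momentum correction itself. A small self-contained sketch of that logic, independent of Paddle and with illustrative values:

    #include <cassert>

    // Mirrors the kernel's gating: before rampup_begin_step DGC compression
    // has not started, so plain momentum is used; once it starts, the
    // kernel falls back to SGD.
    static bool UseMomentumPhase(int current_step, float rampup_begin_step) {
      return current_step < static_cast<int>(rampup_begin_step);
    }

    int main() {
      // dgc_op multiplied the gradient by nranks, so the kernel divides it
      // back out, restoring the original gradient scale.
      const int nranks = 4;
      const float grad_after_dgc = 4.0f;  // original gradient was 1.0
      const float grad_out = (1.0f / nranks) * grad_after_dgc;
      assert(grad_out == 1.0f);

      // With rampup_begin_step = 10, steps 0..9 take the momentum branch
      // and step 10 onwards takes the SGD branch.
      assert(UseMomentumPhase(9, 10.0f));
      assert(!UseMomentumPhase(10, 10.0f));
      return 0;
    }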
diff --git a/paddle/phi/ops/compat/dgc_momentum_sig.cc b/paddle/phi/ops/compat/dgc_momentum_sig.cc
new file mode 100644
index 0000000000000..607f79f82f924
--- /dev/null
+++ b/paddle/phi/ops/compat/dgc_momentum_sig.cc
@@ -0,0 +1,42 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature DGCMomentumOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "dgc_momentum",
+      {"Param",
+       "Grad",
+       "Velocity",
+       "LearningRate",
+       "MasterParam",
+       "current_step",
+       "nranks"},
+      {"mu",
+       "use_nesterov",
+       "regularization_method",
+       "regularization_coeff",
+       "multi_precision",
+       "rescale_grad",
+       "rampup_begin_step"},
+      {"ParamOut", "VelocityOut", "MasterParamOut", "Grad_out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(dgc_momentum, phi::DGCMomentumOpArgumentMapping);
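A closing note on how the mapping is consumed: KernelSignature is positional, so the three lists above must line up slot for slot with the parameter order of phi::DGCMomentumKernel. A tiny illustrative cross-check, not part of the patch:

    // 7 inputs ("Param".."nranks") + 7 attributes ("mu"..
    // "rampup_begin_step") + 4 outputs ("ParamOut".."Grad_out") must match
    // the 18 parameters that follow dev_ctx in the kernel declaration.
    constexpr int kInputs = 7;
    constexpr int kAttrs = 7;
    constexpr int kOutputs = 4;
    static_assert(kInputs + kAttrs + kOutputs == 18,
                  "signature slots must match the kernel parameter count");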