[NPU] support global accumulator for adam #32780

Merged: 10 commits merged on May 13, 2021
21 changes: 20 additions & 1 deletion paddle/fluid/operators/optimizers/adam_op.cc
@@ -198,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);

AddComment(R"DOC(
Adam Optimizer.
@@ -246,4 +253,16 @@ REGISTER_OP_VERSION(adam)
"EpsilonTensor",
"If provided, Adam will use this as epsilon, "
"this has a higher priority than attr(epsilon). "
"For better performance in npu kernel. "));
"For better performance in npu kernel. "))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 attribute [use_global_beta_pow].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_global_beta_pow",
"If true, Adam will use global beta_pow for whole model "
"instead of creating beta_pow for each parameter."
"In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be "
"used in adam op, "
"and beta_pow will be updated after all adam op in the model.",
false));
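
The attribute added above changes how the bias-correction accumulators beta1^t and beta2^t are bookkept: with use_global_beta_pow=false (the default) every parameter's adam op owns its own Beta1Pow/Beta2Pow and multiplies them by beta1/beta2 each step, while with the flag set a single shared pair is advanced once per step after all adam ops have run. A minimal standalone C++ sketch of the two schemes (names and loop structure are illustrative, not Paddle API):

#include <cassert>
#include <cmath>
#include <vector>

// Per-parameter scheme: each parameter owns a beta1^t accumulator and scales
// it inside its own adam update (use_global_beta_pow == false).
// Global scheme: one accumulator is shared by every parameter and scaled
// exactly once per step, after all adam updates have run.
int main() {
  const float beta1 = 0.9f;
  const int num_params = 4;
  const int num_steps = 10;

  std::vector<float> per_param_pow(num_params, 1.0f);  // one beta1^t per param
  float global_pow = 1.0f;                             // single shared beta1^t

  for (int step = 0; step < num_steps; ++step) {
    for (int p = 0; p < num_params; ++p) {
      // ... the adam update for parameter p reads the accumulator here ...
      per_param_pow[p] *= beta1;  // per-parameter update
    }
    global_pow *= beta1;  // one shared update per step
  }

  // Both schemes end up holding beta1^num_steps; the global one just avoids
  // num_params redundant scalar updates (and accumulator tensors) per step.
  for (int p = 0; p < num_params; ++p) {
    assert(std::fabs(per_param_pow[p] - global_pow) < 1e-6f);
  }
  assert(std::fabs(global_pow -
                   static_cast<float>(std::pow(beta1, num_steps))) < 1e-6f);
  return 0;
}
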
55 changes: 32 additions & 23 deletions paddle/fluid/operators/optimizers/adam_op.cu
@@ -154,6 +154,8 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
@@ -254,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
if (!use_global_beta_pow) {
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
AdamKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
@@ -269,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
if (!use_global_beta_pow) {
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}

} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
@@ -328,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode, ndim);
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
if (!use_global_beta_pow) {
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
SparseAdamFunctor<T, GPUAdam, MPDType> functor(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
@@ -351,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
ctx.device_context()),
param->numel());
for_range(functor);
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
if (!use_global_beta_pow) {
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
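
In the CUDA hunks above, both the host-side scalar multiply (when beta_pow lives on the CPU) and the on-device UpdateBetaPow launch (<<<1, 32, 0, dev_ctx.stream()>>>) are now skipped when use_global_beta_pow is true. The kernel body itself is not part of this diff; under the assumption that it only scales two 1-element accumulators, a tiny scalar-update kernel launched that way could look like the following sketch (illustration only, not the actual Paddle UpdateBetaPow):

// Sketch only: assumes the kernel merely scales two 1-element accumulators.
template <typename T>
__global__ void UpdateBetaPowSketch(T beta1, T beta2,
                                    const T* beta1_pow, const T* beta2_pow,
                                    T* beta1_pow_out, T* beta2_pow_out) {
  // Only one thread has useful work; launching a single warp (32 threads)
  // is a common convention for scalar helper kernels.
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    beta1_pow_out[0] = beta1 * beta1_pow[0];
    beta2_pow_out[0] = beta2 * beta2_pow[0];
  }
}

Doing the multiply on the device avoids copying the accumulator back to the host when it already lives in GPU memory, which is why the kernel path and the CPU path are kept separate above.
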
23 changes: 14 additions & 9 deletions paddle/fluid/operators/optimizers/adam_op.h
@@ -406,6 +406,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
@@ -475,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
lr->data<T>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()));
functor(param->numel());
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];

if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
@@ -523,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size(), lazy_mode);
// update beta1 and beta2
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta1 * beta1_pow->data<T>()[0];
beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
beta2 * beta2_pow->data<T>()[0];
}
if (lazy_mode) {
VLOG(3) << "run cpu lazy mode";
size_t row_count = grad_merge.rows().size();
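
For context on why each adam op needs the current beta1^t and beta2^t at all: they feed the bias-corrected step size. A short sketch of a single Adam parameter update in the textbook formulation follows; the Paddle functors differ in details such as how epsilon is scaled, so treat this as illustrative only:

#include <cmath>

// Textbook Adam update for one scalar parameter; beta1_pow == beta1^t and
// beta2_pow == beta2^t after t steps.
float AdamParamUpdate(float param, float grad, float* moment1, float* moment2,
                      float lr, float beta1, float beta2, float epsilon,
                      float beta1_pow, float beta2_pow) {
  *moment1 = beta1 * (*moment1) + (1.0f - beta1) * grad;
  *moment2 = beta2 * (*moment2) + (1.0f - beta2) * grad * grad;
  // Bias-corrected step size: this is why every adam op reads the current
  // beta1^t / beta2^t, whether it owns them or shares one global copy.
  const float lr_t = lr * std::sqrt(1.0f - beta2_pow) / (1.0f - beta1_pow);
  return param - lr_t * (*moment1) / (std::sqrt(*moment2) + epsilon);
}

When use_global_beta_pow is true, the CPU kernel above still reads Beta1Pow/Beta2Pow as inputs in exactly this way; it simply no longer writes the advanced values into Beta1PowOut/Beta2PowOut.
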
44 changes: 25 additions & 19 deletions paddle/fluid/operators/optimizers/adam_op_npu.cc
@@ -36,7 +36,6 @@ class AdamNPUKernel : public framework::OpKernel<T> {
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
@@ -50,34 +49,37 @@ class AdamNPUKernel : public framework::OpKernel<T> {
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");

auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* beta1_pow = ctx.Input<Tensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<Tensor>("Beta2Pow");

auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");

bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

param_out->mutable_data<T>(ctx.GetPlace());
mom1_out->mutable_data<T>(ctx.GetPlace());
mom2_out->mutable_data<T>(ctx.GetPlace());

// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place.
// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
// place.
LoDTensor beta1_pow_tmp;
LoDTensor beta2_pow_tmp;
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
// `mutable_data` operation needs to be done after getting data
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
} else {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
beta1_pow = &beta1_pow_tmp;
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
} else {
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
beta2_pow = &beta2_pow_tmp;
}

const Tensor* beta1_tensor = nullptr;
@@ -174,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel<T> {
*mom2, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom2_out);
}
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
auto runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
}
}
};

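
The notable change in the NPU kernel above is that Beta1PowOut/Beta2PowOut are no longer allocated up front: their mutable_data calls move next to the on-device Mul runners inside the !use_global_beta_pow branch, so nothing is allocated or launched for the accumulators when the model keeps a global copy. A toy C++ sketch of that gating pattern, with stand-in types for the NPU tensor and the Mul runner (assumed helpers for illustration, not Paddle API):

#include <memory>

// DeviceScalar pretends to be a 1-element NPU tensor; Mul pretends to be
// NpuOpRunner("Mul", ...). Both are stand-ins for illustration only.
struct DeviceScalar {
  std::unique_ptr<float> data;
  void Allocate() { data = std::make_unique<float>(0.0f); }  // ~ mutable_data<T>()
};

void Mul(const DeviceScalar& in, float scale, DeviceScalar* out) {
  *out->data = *in.data * scale;  // pretend on-device multiply
}

void MaybeAdvanceBetaPow(bool use_global_beta_pow,
                         const DeviceScalar& beta1_pow, float beta1,
                         DeviceScalar* beta1_pow_out) {
  // Parameter and moment updates are unaffected by the flag and run before this.
  if (!use_global_beta_pow) {
    beta1_pow_out->Allocate();             // only allocate the output when it is used
    Mul(beta1_pow, beta1, beta1_pow_out);  // only scale the accumulator the op owns
  }
  // Otherwise Beta1PowOut is left untouched; a single global accumulator is
  // advanced elsewhere, once per step for the whole model.
}
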
82 changes: 44 additions & 38 deletions paddle/fluid/operators/optimizers/adam_op_xpu.cc
@@ -73,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
"value is:%d.",
beta2_pow_out->numel()));

bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;

T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
@@ -111,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel<T> {
mom1_out.template mutable_data<T>(ctx.GetPlace()),
mom2_out.template mutable_data<T>(ctx.GetPlace()),
param_out.template mutable_data<T>(ctx.GetPlace()), param.numel());

// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const T* beta1_pow_p = beta1_pow.template data<T>();
beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const T* beta2_pow_p = beta2_pow.template data<T>();
beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];
} else {
T cpu_beta1_pow_out_data;
T cpu_beta2_pow_out_data;
memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
beta1_pow_ptr, sizeof(T));

cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
beta2_pow_ptr, sizeof(T));

cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;

T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta1_pow_out_p, platform::CPUPlace(),
&cpu_beta1_pow_out_data, sizeof(T));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta2_pow_out_p, platform::CPUPlace(),
&cpu_beta2_pow_out_data, sizeof(T));
if (!use_global_beta_pow) {
// update in cpu and then copy to xpu
if (beta1_pow.place() == platform::CPUPlace() &&
beta2_pow.place() == platform::CPUPlace()) {
const T* beta1_pow_p = beta1_pow.template data<T>();
beta1_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta1 * beta1_pow_p[0];
const T* beta2_pow_p = beta2_pow.template data<T>();
beta2_pow_out->mutable_data<T>(platform::CPUPlace())[0] =
beta2 * beta2_pow_p[0];

} else {
T cpu_beta1_pow_out_data;
T cpu_beta2_pow_out_data;

memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()),
beta1_pow_ptr, sizeof(T));

cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1;
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data,
BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()),
beta2_pow_ptr, sizeof(T));

cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2;

T* beta1_pow_out_p = beta1_pow_out->mutable_data<T>(ctx.GetPlace());
T* beta2_pow_out_p = beta2_pow_out->mutable_data<T>(ctx.GetPlace());
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta1_pow_out_p, platform::CPUPlace(),
&cpu_beta1_pow_out_data, sizeof(T));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
beta2_pow_out_p, platform::CPUPlace(),
&cpu_beta2_pow_out_data, sizeof(T));
}

PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
}

PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true,
platform::errors::External(
"XPU API return wrong value[%d], please check "
"where Baidu Kunlun Card is properly installed.",
r));
} else {
PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument(
"Variable type not supported by adam_op"));
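
The XPU kernel keeps the accumulator update on the host: when Beta1Pow/Beta2Pow live on the CPU the multiply happens directly in host memory, and when they live on the XPU the 1-element values are copied to the host, scaled, and copied back, with the whole sequence now guarded by !use_global_beta_pow. A minimal sketch of that scalar round trip, with plain memcpy standing in for memory::Copy between XPUPlace and CPUPlace (illustration only, not the XPU runtime API):

#include <cstring>

// device_beta_pow / device_beta_pow_out stand for 1-element XPU tensors;
// std::memcpy stands in for memory::Copy(CPUPlace <-> XPUPlace).
void UpdateBetaPowViaHost(const float* device_beta_pow, float beta,
                          float* device_beta_pow_out) {
  float host_value = 0.0f;
  std::memcpy(&host_value, device_beta_pow, sizeof(float));      // device -> host
  host_value *= beta;                                            // scale on the host
  std::memcpy(device_beta_pow_out, &host_value, sizeof(float));  // host -> device
}

For a single float per accumulator the two extra copies are cheap, and with use_global_beta_pow set they are skipped entirely, removing the last per-parameter traffic for the accumulators on this path.
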