Merge branch 'develop' into opt/aligned_vector

PaddlePaddle · Sep 2, 2021 · 0dfc8f7 · 0dfc8f7 · paddle-bot-old · Sep 3, 2021
2 parents c202180 + bb63396
commit 0dfc8f7
Show file tree

Hide file tree

Showing 51 changed files with 6,615 additions and 676 deletions.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -183,6 +183,7 @@ function(op_library TARGET)
         list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
         list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
         list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
+        list(REMOVE_ITEM hip_srcs "svd_op.cu")
         list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
         list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
         hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}

diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
+DECLARE_int32(min_loss_scaling);
+
 namespace paddle {
 namespace operators {
 
@@ -49,7 +51,7 @@ void Update(const platform::NPUDeviceContext& ctx,
 
     std::vector<int> bad_out_data;
     TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
-    if (bad_out_data[0] == decr_every_n_nan_or_inf) {
+    if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                           {*updated_loss_scaling_tensor},
                                           {{"power", static_cast<float>(1)},
@@ -60,13 +62,18 @@ void Update(const platform::NPUDeviceContext& ctx,
 
       std::vector<T> new_loss_scaling;
       TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
-      if (new_loss_scaling[0] < static_cast<T>(1)) {
+      float min_value = 1.0;
+      if (FLAGS_min_loss_scaling > 1) {
+        min_value = static_cast<float>(FLAGS_min_loss_scaling);
+      }
+
+      if (new_loss_scaling[0] < min_value) {
         // updated_loss_scaling_data = 1
-        const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
-                                            {*updated_loss_scaling_tensor},
-                                            {{"power", static_cast<float>(1)},
-                                             {"scale", static_cast<float>(0)},
-                                             {"shift", static_cast<float>(1)}});
+        const auto& runner_p4 = NpuOpRunner(
+            "Power", {*pre_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
+            {{"power", static_cast<float>(1)},
+             {"scale", static_cast<float>(0)},
+             {"shift", static_cast<float>(min_value)}});
 
         runner_p4.Run(stream);
       }
@@ -93,7 +100,7 @@ void Update(const platform::NPUDeviceContext& ctx,
     std::vector<int> good_out_data;
     TensorToVector(*good_out_tensor, ctx, &good_out_data);
 
-    if (good_out_data[0] == incr_every_n_steps) {
+    if (good_out_data[0] >= incr_every_n_steps) {
       const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
                                           {*updated_loss_scaling_tensor},
                                           {{"power", static_cast<float>(1)},

diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -85,6 +85,14 @@ class ElementwiseOp : public framework::OperatorWithKernel {
       auto y_dims = ctx->GetInputDim("Y");
       int max_dim = std::max(x_dims.size(), y_dims.size());
       int axis = ctx->Attrs().Get<int>("axis");
+      if (x_dims.size() == y_dims.size()) {
+        PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), true,
+                          platform::errors::InvalidArgument(
+                              "axis should be -1 or 0 while the dimension of "
+                              "tensor X (%s) is equal to the dimension of "
+                              "tensor Y (%s), but received axis: %s",
+                              x_dims.size(), y_dims.size(), axis));
+      }
       PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), true,
                         platform::errors::InvalidArgument(
                             "The axis range must be [%s, %s), but axis is %s. "

diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/gather_nd_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+// NPU forward kernel for the gather_nd op. Reads the inputs "X" and "Index",
+// allocates "Out", and fills it by running the NPU "GatherNd" operator on the
+// device stream. Only int32 / int64 index tensors are accepted.
+template <typename DeviceContext, typename T>
+class GatherNdNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *input = ctx.Input<Tensor>("X");
+    const auto *indices = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+    const auto &place = ctx.GetPlace();
+
+    // The output buffer is allocated before any of the early returns below.
+    output->template mutable_data<T>(place);
+
+    // Empty input: nothing is gathered.
+    if (input->numel() == 0) {
+      return;
+    }
+
+    // Empty index: the input is copied to the output unchanged.
+    if (indices->numel() == 0) {
+      framework::TensorCopy(*input, place, ctx.device_context(), output);
+      return;
+    }
+
+    // Reject any index dtype other than INT32 / INT64.
+    const auto indices_dtype = indices->type();
+    const bool is_int32 = indices_dtype == framework::proto::VarType::INT32;
+    const bool is_int64 = indices_dtype == framework::proto::VarType::INT64;
+    const bool index_type_match = is_int32 || is_int64;
+    PADDLE_ENFORCE_EQ(index_type_match, true,
+                      platform::errors::InvalidArgument(
+                          "Index holds the wrong type, it holds [%s],"
+                          "but desires to be [%s] or [%s]",
+                          paddle::framework::DataTypeToString(indices_dtype),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT32),
+                          paddle::framework::DataTypeToString(
+                              framework::proto::VarType::INT64)));
+
+    // Build the operator first, then fetch the stream and launch it.
+    const NpuOpRunner gather_op("GatherNd", {*input, *indices}, {*output}, {});
+    auto npu_stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    gather_op.Run(npu_stream);
+  }
+};
+
+// NPU backward kernel for the gather_nd op. Zero-fills the gradient of "X"
+// and then runs the NPU "ScatterNdAdd" operator with (dX, Index, dOut) as
+// inputs and dX as output. A 1-D index is viewed as {1, n} and the incoming
+// gradient is viewed with a matching leading dimension of 1 first.
+template <typename DeviceContext, typename T>
+class GatherNdGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *indices = ctx.Input<Tensor>("Index");
+    const auto *input = ctx.Input<Tensor>("X");
+    const auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+
+    // Nothing to write when the input gradient has no elements.
+    if (in_grad->numel() == 0) {
+      return;
+    }
+
+    // Empty index: the output gradient is copied to the input gradient.
+    if (indices->numel() == 0) {
+      framework::TensorCopy(*out_grad, ctx.GetPlace(), ctx.device_context(),
+                            in_grad);
+      return;
+    }
+
+    // These views must outlive the branch below, because `indices` and
+    // `out_grad` may be re-pointed at them.
+    framework::Tensor indices_view(indices->type());
+    framework::Tensor out_grad_view(out_grad->type());
+    const auto indices_dims = indices->dims();
+    if (indices_dims.size() == 1) {
+      // View the 1-D index of length n as a {1, n} tensor sharing its data.
+      const std::vector<int64_t> indices_shape = {1, indices_dims[0]};
+      indices_view.ShareDataWith(*indices);
+      indices_view.Resize(framework::make_ddim(indices_shape));
+      indices = &indices_view;
+
+      // View dOut as {1, <dims of X from position n onward>}; n is the element
+      // count of the reshaped index, i.e. indices_dims[0].
+      out_grad_view.ShareDataWith(*out_grad);
+      const auto &input_dims = input->dims();
+      std::vector<int64_t> out_grad_shape{1};
+      for (int axis = static_cast<int>(indices_dims[0]);
+           axis < input_dims.size(); ++axis) {
+        out_grad_shape.push_back(input_dims[axis]);
+      }
+      out_grad_view.Resize(framework::make_ddim(out_grad_shape));
+      out_grad = &out_grad_view;
+    }
+
+    auto npu_stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // Clear dX before the scatter-add is launched on the same stream.
+    platform::NPUMemsetAsync(static_cast<void *>(in_grad_data), 0,
+                             in_grad->numel() * sizeof(T), npu_stream);
+
+    const NpuOpRunner scatter_op("ScatterNdAdd",
+                                 {*in_grad, *indices, *out_grad}, {*in_grad},
+                                 {{"use_locking", false}});
+    scatter_op.Run(npu_stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias used by the macros below
+REGISTER_OP_NPU_KERNEL(
+    gather_nd, ops::GatherNdNPUKernel<paddle::platform::NPUDeviceContext,
+                                      paddle::platform::float16>,
+    ops::GatherNdNPUKernel<paddle::platform::NPUDeviceContext, float>);  // forward: float16, float
+
+REGISTER_OP_NPU_KERNEL(
+    gather_nd_grad,
+    ops::GatherNdGradNPUKernel<paddle::platform::NPUDeviceContext,
+                               paddle::platform::float16>,
+    ops::GatherNdGradNPUKernel<paddle::platform::NPUDeviceContext, float>);  // backward: float16, float
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
@@ -100,7 +100,8 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>(
         "use_cudnn",
         "(bool, default true) Only used in cudnn kernel, need install cudnn")
-        .SetDefault(true);
+        .SetDefault(true)
+        .AsExtra();
 
     AddAttr<bool>(
         "align_corners",

diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/label_smooth_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+// Allocates `out` as T with the dims of `in` on `place`, then runs the NPU
+// "Muls" operator on `stream` with attribute "value" = val, reading `in` and
+// writing `out`.
+template <typename T>
+void LabelSmoothMuls(const platform::Place& place, const aclrtStream& stream,
+                     const Tensor* in, float val, Tensor* out) {
+  const auto& in_dims = in->dims();
+  out->mutable_data<T>(in_dims, place);
+
+  const NpuOpRunner muls_op("Muls", {*in}, {*out}, {{"value", val}});
+  muls_op.Run(stream);
+}
+
+// Allocates `out` as T with the dims of `in` on `place`, then runs the NPU
+// "Adds" operator on `stream` with attribute "value" = val, reading `in` and
+// writing `out`.
+template <typename T>
+void LabelSmoothAdds(const platform::Place& place, const aclrtStream& stream,
+                     const Tensor* in, float val, Tensor* out) {
+  const auto& in_dims = in->dims();
+  out->mutable_data<T>(in_dims, place);
+
+  const NpuOpRunner adds_op("Adds", {*in}, {*out}, {{"value", val}});
+  adds_op.Run(stream);
+}
+
+// Allocates `out` as T on `place` (using the dims `out` already has), then
+// runs the NPU "AddV2" operator on `stream` with inputs `in1` and `in2`,
+// writing `out`. No attributes are passed to the operator.
+template <typename T>
+void LabelSmoothAddBroadCast(const platform::Place& place,
+                             const aclrtStream& stream, const Tensor* in1,
+                             const Tensor* in2, Tensor* out) {
+  out->mutable_data<T>(place);
+
+  const NpuOpRunner add_op("AddV2", {*in1, *in2}, {*out}, {});
+  add_op.Run(stream);
+}
+
+// NPU forward kernel for the label_smooth op.
+//
+// Let label_dim be the size of the last dimension of input "X".
+//  - When the optional input "PriorDist" is present:
+//      Out = AddV2((1 - epsilon) * X, epsilon * PriorDist viewed as
+//                  {1, label_dim})
+//  - Otherwise:
+//      Out = (1 - epsilon) * X + epsilon / label_dim
+//
+// Every step is issued on the NPU stream of the execution context.
+template <typename T>
+class LabelSmoothNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_t = ctx.Output<LoDTensor>("Out");
+    auto* in_t = ctx.Input<LoDTensor>("X");
+    auto* dist_t = ctx.Input<Tensor>("PriorDist");
+    const auto epsilon = ctx.Attr<float>("epsilon");
+
+    // Size of the last dimension of X.
+    const auto label_dim = in_t->dims()[in_t->dims().size() - 1];
+    auto place = ctx.GetPlace();
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // Both branches start from the scaled input (1 - epsilon) * X.
+    Tensor scaled_in;
+    LabelSmoothMuls<T>(place, stream, in_t, (1 - epsilon), &scaled_in);
+
+    if (dist_t) {
+      // Scale the prior by epsilon and view it as {1, label_dim} before it is
+      // added to the scaled input.
+      Tensor scaled_dist;
+      LabelSmoothMuls<T>(place, stream, dist_t, epsilon, &scaled_dist);
+      scaled_dist.Resize({1, label_dim});
+      LabelSmoothAddBroadCast<T>(place, stream, &scaled_in, &scaled_dist,
+                                 out_t);
+    } else {
+      // No prior given: add the constant epsilon / label_dim to each element.
+      LabelSmoothAdds<T>(place, stream, &scaled_in, (epsilon / label_dim),
+                         out_t);
+    }
+  }
+};
+
+// NPU backward kernel for the label_smooth op: the gradient of "X" is the
+// gradient of "Out" multiplied by (1 - epsilon), computed by LabelSmoothMuls
+// on the NPU stream of the execution context.
+template <typename T>
+class LabelSmoothGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    const auto eps = ctx.Attr<float>("epsilon");
+
+    const auto dev_place = ctx.GetPlace();
+    auto npu_stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // LabelSmoothMuls allocates in_grad with the dims of out_grad.
+    const float keep_ratio = 1 - eps;
+    LabelSmoothMuls<T>(dev_place, npu_stream, out_grad, keep_ratio, in_grad);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias for the kernel classes
+namespace plat = paddle::platform;  // short alias for plat::float16 below
+
+REGISTER_OP_NPU_KERNEL(label_smooth, ops::LabelSmoothNPUKernel<float>,
+                       ops::LabelSmoothNPUKernel<plat::float16>);  // forward: float, float16
+REGISTER_OP_NPU_KERNEL(label_smooth_grad, ops::LabelSmoothGradNPUKernel<float>,
+                       ops::LabelSmoothGradNPUKernel<plat::float16>);  // backward: float, float16