From 468ac6993b322aa3bebbc428a769b7f4edbda787 Mon Sep 17 00:00:00 2001
From: OleNet <olenet@126.com>
Date: Tue, 16 Mar 2021 18:57:17 +0800
Subject: [PATCH] [NPU] add npu kernel for mean Op (#31562)

* update mean op

* update mean op

* give a better test activation

Co-authored-by: oyjxer <1728722986@qq.com>
---
 paddle/fluid/operators/CMakeLists.txt         |   2 +
 paddle/fluid/operators/mean_op_npu.cc         | 135 ++++++++++++++++
 paddle/fluid/operators/mean_op_npu_test.cc    | 133 ++++++++++++++++
 .../tests/unittests/npu/test_mean_op_npu.py   | 149 ++++++++++++++++++
 4 files changed, 419 insertions(+)
 create mode 100644 paddle/fluid/operators/mean_op_npu.cc
 create mode 100644 paddle/fluid/operators/mean_op_npu_test.cc
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4797b0e7154e0..afe8e6bf18014 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -184,4 +184,6 @@ endif()
 
 if(WITH_ASCEND_CL)
 cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor)
+cc_test(mean_op_npu_test SRCS mean_op_npu_test.cc DEPS op_registry mean_op scope device_context enforce executor)
 endif()
+
diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc
new file mode 100644
index 0000000000000..f7dba26604964
--- /dev/null
+++ b/paddle/fluid/operators/mean_op_npu.cc
@@ -0,0 +1,135 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mean_op.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+
+namespace paddle {
+namespace operators {
+
+// NPU kernel for the forward `mean` op.
+// Reduces input X over all of its axes with the "ReduceMeanD" NPU operator
+// and stores the result in Out, which is resized to shape {1}.
+// T is the tensor element type; DeviceContext is not referenced in the body.
+template <typename DeviceContext, typename T>
+class MeanNPUKernel : public framework::OpKernel<T> {
+ public:
+  // ctx provides input "X", output "Out", the target place and the NPU
+  // stream the operator is launched on.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    // Reduce across every dimension of X: axes = [0, rank).
+    const auto reduce_ndim = x->dims().size();
+    std::vector<int> axes;
+    axes.reserve(reduce_ndim);
+    for (auto i = 0; i < reduce_ndim; ++i) {
+      axes.push_back(i);
+    }
+
+    framework::NPUAttributeMap attr_input = {{"keep_dims", false},
+                                             {"axes", axes}};
+
+    // Out is a single-element tensor of shape {1}.
+    const std::vector<int64_t> out_dims = {1};
+    out->Resize(framework::make_ddim(out_dims));
+    out->mutable_data<T>(ctx.GetPlace());
+
+    // ReduceMeanD writes straight into Out, so no intermediate device tensor
+    // is allocated here.
+    auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
+
+    // Launch on the stream of the NPU device context bound to this op.
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
+
+// NPU kernel for `mean_grad`: fills X@GRAD with Out@GRAD / numel(X@GRAD),
+// composed from NPU ops as (1 / numel) * ones_like(X@GRAD) * Out@GRAD.
+template <typename DeviceContext, typename T>
+class MeanGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  // Requires Out@GRAD to hold exactly one element; writes output X@GRAD.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto stream =
+        context.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    auto grad = context.Input<Tensor>(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(grad->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "Mean Gradient Input Tensor len should be 1. But "
+                          "received Out@Grad's elements num is %d.",
+                          grad->numel()));
+
+    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
+    IG->mutable_data<T>(context.GetPlace());
+    const auto ig_dims = IG->dims();
+
+    // ones: a tensor of 1s with the same shape as X@GRAD.
+    Tensor ones(grad->type());
+    ones.Resize(ig_dims);
+    ones.mutable_data<T>(context.GetPlace());
+    auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
+    runner_ones.Run(stream);
+
+    // means: one-element tensor holding 1 / numel(X@GRAD). The host value is
+    // kept in T (not float) so the uploaded data matches the T-typed tensors
+    // that are multiplied with it below.
+    Tensor mean_tensor(grad->type());
+    mean_tensor.Resize({1});
+    mean_tensor.mutable_data<T>(context.GetPlace());
+    const std::vector<T> mean_vec = {
+        static_cast<T>(1.0 / static_cast<double>(IG->numel()))};
+    framework::TensorFromVector(mean_vec, context.device_context(),
+                                &mean_tensor);
+
+    // means mul ones: result has the shape of X@GRAD.
+    Tensor mean_ma(grad->type());
+    mean_ma.Resize(ig_dims);
+    mean_ma.mutable_data<T>(context.GetPlace());
+    auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
+    runner_mul_1.Run(stream);
+
+    // and mul grad: scale by the upstream gradient, writing into X@GRAD.
+    auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
+    runner_mul_2.Run(stream);
+  }
+};
+
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Forward kernels: one instantiation per registered element type.
+REGISTER_OP_NPU_KERNEL(
+    mean, ops::MeanNPUKernel<plat::NPUDeviceContext, int>,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, float>,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, double>,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, plat::float16>)
+
+// Backward kernels: registered for the same element types as the forward op.
+REGISTER_OP_NPU_KERNEL(
+    mean_grad, ops::MeanGradNPUKernel<plat::NPUDeviceContext, int>,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, float>,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, double>,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, plat::float16>)
diff --git a/paddle/fluid/operators/mean_op_npu_test.cc b/paddle/fluid/operators/mean_op_npu_test.cc
new file mode 100644
index 0000000000000..7379955084d1b
--- /dev/null
+++ b/paddle/fluid/operators/mean_op_npu_test.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;  // NOTE(review): unused in this file
+// Reference the mean / mean_grad ops and their NPU kernels used by the tests.
+USE_OP(mean);
+USE_OP_DEVICE_KERNEL(mean, NPU);
+USE_OP(mean_grad);
+USE_OP_DEVICE_KERNEL(mean_grad, NPU);
+
+// Runs the forward op `op_type` on X = {1, 2, 3, 4} and expects Out == {2.5}.
+// T is the element type of the uploaded input and of the read-back output.
+template <typename T>
+void Compare(f::Scope* scope, const p::DeviceContext& ctx,
+             std::string op_type) {
+  // init: create X in the scope and upload the four input values via ctx.
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  const std::vector<T> init = {static_cast<T>(1.0), static_cast<T>(2.0),
+                               static_cast<T>(3.0), static_cast<T>(4.0)};
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({4});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+
+  auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}},
+                                    {{"Out", {"Out"}}}, {});
+
+  op->Run(*scope, place);
+
+  // Read back with element type T rather than a hard-coded float, so the
+  // helper is usable for every T it is instantiated with.
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+
+  ctx.Wait();
+
+  // ASSERT on the size so the element access below cannot go out of range.
+  ASSERT_EQ(out_vec.size(), static_cast<size_t>(1));
+  EXPECT_FLOAT_EQ(static_cast<float>(out_vec[0]), 2.5f);
+}
+
+// Runs the backward op `op_type` with Out@GRAD = {2} and X / X@GRAD of shape
+// {4}, and expects every X@GRAD element to equal Out@GRAD / numel(X).
+template <typename T>
+void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
+                 std::string op_type) {
+  const float dvalue = 2.0f;  // upstream gradient fed as Out@GRAD
+  const int64_t numel = 4;    // element count of X and X@GRAD
+
+  auto dout = scope->Var("DOut");
+  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
+  tensor_dout->Resize({1});
+  const std::vector<T> init_dout = {static_cast<T>(dvalue)};
+  TensorFromVector(init_dout, ctx, tensor_dout);
+  ctx.Wait();
+
+  // Only the shapes of X and X@GRAD are set here; no X data is uploaded.
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  tensor_x->Resize({numel});
+
+  auto dx = scope->Var("DX");
+  auto tensor_dx = dx->GetMutable<f::LoDTensor>();
+  tensor_dx->Resize({numel});
+  ctx.Wait();
+
+  auto op = f::OpRegistry::CreateOp(
+      op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, {{"X@GRAD", {"DX"}}},
+      {});
+  auto place = ctx.GetPlace();
+  op->Run(*scope, place);
+
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_dx, ctx, &out_vec);
+  ctx.Wait();
+
+  // d(mean)/dx_i = 1 / numel, hence X@GRAD_i = Out@GRAD / numel (2 / 4 here).
+  // 1 / dvalue yields the same 0.5 only by coincidence, so it is not used.
+  const float expected = dvalue / static_cast<float>(numel);
+  ASSERT_EQ(out_vec.size(), static_cast<size_t>(numel));
+  for (const auto& v : out_vec) {
+    EXPECT_FLOAT_EQ(static_cast<float>(v), expected);
+  }
+}
+
+// Smoke tests on NPU device 0 with fp32 data: forward `mean`, then `mean_grad`.
+TEST(mean, NPU_fp32) {
+  f::Scope npu_scope;
+  p::NPUDeviceContext npu_ctx(p::NPUPlace(0));
+  Compare<float>(&npu_scope, npu_ctx, "mean");
+}
+
+TEST(mean_grad, NPU_fp32) {
+  f::Scope npu_scope;
+  p::NPUDeviceContext npu_ctx(p::NPUPlace(0));
+  CompareGrad<float>(&npu_scope, npu_ctx, "mean_grad");
+}
diff --git a/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
new file mode 100644
index 0000000000000..f949f5ccf66d1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
@@ -0,0 +1,149 @@
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+paddle.enable_static()  # tests below build paddle.static Programs
+SEED = 2021  # seeds NumPy and both Programs in TestMeanNet._test
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMean(OpTest):
+    """Output check of the `mean` op on NPUPlace(0), float32 3x3 input."""
+
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "mean"
+        self.init_dtype()
+        # Random input; the expected output is NumPy's mean over all elements.
+        data = np.random.random([3, 3]).astype(self.dtype)
+        self.inputs = {'X': data}
+        self.attrs = {}
+        self.outputs = {'Out': np.mean(data)}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.__class__.no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMeanFP16(OpTest):
+    """Output check of the `mean` op on NPUPlace(0), float16 3x3 input."""
+
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "mean"
+        self.init_dtype()
+        # Random input; the expected output is NumPy's mean over all elements.
+        data = np.random.random([3, 3]).astype(self.dtype)
+        self.inputs = {'X': data}
+        self.attrs = {}
+        self.outputs = {'Out': np.mean(data)}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.__class__.no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMeanNet(unittest.TestCase):
+    """Trains a small net with a `mean` loss on CPU and NPU and compares."""
+    def _test(self, run_npu=True):
+        """Run 100 SGD steps on NPU (or CPU); return the last (pred, loss)."""
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = SEED
+        startup_prog.random_seed = SEED
+        np.random.seed(SEED)
+        # Fixed inputs; identical for every call because NumPy is re-seeded.
+        a_np = np.random.random(size=(32, 32)).astype('float32')
+        b_np = np.random.random(size=(32, 32)).astype('float32')
+        label_np = np.random.randint(2, size=(32, 1)).astype('int64')
+
+        with paddle.static.program_guard(main_prog, startup_prog):
+            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
+            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
+            label = paddle.static.data(
+                name="label", shape=[32, 1], dtype='int64')
+
+            c = paddle.multiply(a, b)
+            d = paddle.sqrt(c)
+            fc_1 = fluid.layers.fc(input=d, size=128)
+            prediction = fluid.layers.fc(input=fc_1, size=2, act='sigmoid')
+            # Loss is the mean of the cross-entropy cost (the op under test).
+            cost = fluid.layers.cross_entropy(input=prediction, label=label)
+            loss = fluid.layers.mean(cost)
+            sgd = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd.minimize(loss)
+
+        if run_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_prog)
+        print("Start run on {}".format(place))
+        for epoch in range(100):
+            # Every iteration feeds the same batch.
+            pred_res, loss_res = exe.run(
+                main_prog,
+                feed={"a": a_np,
+                      "b": b_np,
+                      "label": label_np},
+                fetch_list=[prediction, loss])
+            if epoch % 10 == 0:
+                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                    epoch, pred_res[0], loss_res))
+
+        return pred_res, loss_res
+
+    def test_npu(self):
+        cpu_pred, cpu_loss = self._test(False)
+        npu_pred, npu_loss = self._test(True)
+        # Both runs must agree within np.allclose's default tolerances.
+        self.assertTrue(np.allclose(npu_pred, cpu_pred))
+        self.assertTrue(np.allclose(npu_loss, cpu_loss))
+
+
+# Run all tests above when this file is executed as a script.
+if __name__ == '__main__':
+    unittest.main()