diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index d3d35cb3ae7e5..e068d2a9a5181 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -62,7 +62,6 @@ function(op_library TARGET)
   set(hip_cc_srcs)
   set(xpu_cc_srcs)
   set(xpu_kp_cc_srcs)
-  set(npu_cc_srcs)
   set(mlu_cc_srcs)
   set(cudnn_cu_cc_srcs)
   set(miopen_cu_cc_srcs)
@@ -320,12 +319,7 @@ function(op_library TARGET)
     if(WITH_UNITY_BUILD AND op_library_UNITY)
       # Combine the cc source files.
       compose_unity_target_sources(
-        ${UNITY_TARGET}
-        cc
-        ${cc_srcs}
-        ${mkldnn_cc_srcs}
-        ${xpu_cc_srcs}
-        ${npu_cc_srcs}
+        ${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}
         ${mlu_cc_srcs})
       if(TARGET ${UNITY_TARGET})
         # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
@@ -339,12 +333,6 @@ function(op_library TARGET)
       endif()
       # Add alias library to handle dependencies.
       add_library(${TARGET} ALIAS ${UNITY_TARGET})
-    else()
-      cc_library(
-        ${TARGET}
-        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}
-          ${mlu_cc_srcs}
-        DEPS ${op_library_DEPS} ${op_common_deps})
     endif()
   endif()
@@ -355,7 +343,6 @@ function(op_library TARGET)
   list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
   list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
   list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
-  list(LENGTH npu_cc_srcs npu_cc_srcs_len)
   list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
 
   # Define operators that don't need pybind here.
@@ -590,7 +577,6 @@ function(register_operators)
     "*_op.cc")
   string(REPLACE "_mkldnn" "" OPS "${OPS}")
   string(REPLACE "_xpu" "" OPS "${OPS}")
-  string(REPLACE "_npu" "" OPS "${OPS}")
   string(REPLACE "_mlu" "" OPS "${OPS}")
   string(REPLACE ".cc" "" OPS "${OPS}")
   list(REMOVE_DUPLICATES OPS)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4955d47ca77e5..27842543c5902 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -100,12 +100,12 @@ register_operators(EXCLUDES py_func_op warpctc_op dgc_op generated_op1 generated
     recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
-op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc run_program_op_npu.cc DEPS executor_cache ${OP_HEADER_DEPS})
-target_link_libraries(run_program_op cuda_graph_with_memory_pool)
+
 op_library(quantize_linear_op DEPS phi)
 op_library(save_combine_op DEPS string_array phi)
 op_library(load_combine_op DEPS string_array)
+
 if (WITH_GPU OR WITH_ROCM)
   op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
 elseif (WITH_XPU_KP)
@@ -179,10 +179,7 @@ if (WITH_ASCEND)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper)
 endif()
 
-if (WITH_ASCEND_CL)
-  cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS generated_static_op)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner)
-endif()
+
 # FIXME(typhoonzero): operator deps may not needed.
 # op_library(unsqueeze_op DEPS reshape_op)
@@ -218,18 +215,13 @@ if (WITH_PYTHON)
   cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
 endif()
 
-if (WITH_ASCEND_CL)
-  cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
-  cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_function scope device_context enforce executor compare_op)
-endif()
+
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 add_subdirectory(benchmark)
 
 cc_test_old(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op ${COMMON_OP_DEPS})
 
-if (WITH_ASCEND_CL)
-  cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor)
-endif()
+
 if(WITH_MKLDNN)
diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc
deleted file mode 100644
index 0a859d1f564a9..0000000000000
--- a/paddle/fluid/operators/abs_op_npu.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AbsNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("Abs",
-                                     {
-                                         *x,
-                                     },
-                                     {*out},
-                                     {});
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class AbsGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-
-    dx->mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {});
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    abs,
-    ops::AbsNPUKernel<plat::NPUDeviceContext, float>,
-    ops::AbsNPUKernel<plat::NPUDeviceContext, plat::float16>);
-
-REGISTER_OP_NPU_KERNEL(
-    abs_grad,
-    ops::AbsGradNPUKernel<plat::NPUDeviceContext, float>,
-    ops::AbsGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
deleted file mode 100644
index 9f3392f2eabc5..0000000000000
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ /dev/null
@@ -1,1116 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include -#include - -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class PowNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto factor = ctx.Attr("factor"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Power", - {*x}, - {*out}, - {{"power", factor}, - {"scale", static_cast(1.0)}, - {"shift", static_cast(0.0)}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PowGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto factor = ctx.Attr("factor"); - - auto x_dims = x->dims(); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(liym27): dx = dout * factor * x.pow(factor-1) - - // Step1: Compute x_pow = x.pow(factor-1) - phi::DenseTensor x_pow(x->type()); - x_pow.mutable_data(x->dims(), place); - const auto& runner_pow = NpuOpRunner( - "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); - runner_pow.Run(stream); - - // Step 2: Construct a broadcast factor, which has the same shape with x. - - // 2.1 Get a factor tensor with shape [1]. - phi::DenseTensor factor_tensor(phi::DataType::FLOAT32); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, factor); - - // 2.2 Get the factor which has the shape with x and the same value with - // factor. 
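// ---- [Editorial aside, not part of the deleted file] ----------------------
// Steps 1-4 of this PowGrad kernel assemble the ordinary power rule,
//   d/dx x^f = f * x^(f-1)   =>   dx = dout * factor * pow(x, factor - 1),
// from NPU primitives: Power computes x^(factor-1), FillD broadcasts the
// scalar factor to x's shape, and the two Muls form the final product.
// ----------------------------------------------------------------------------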
- phi::DenseTensor factor_bc_tensor(phi::DataType::FLOAT32); - factor_bc_tensor.mutable_data(x_dims, place); - const auto& runner_bc = NpuOpRunner("FillD", - {factor_tensor}, - {factor_bc_tensor}, - {{"dims", phi::vectorize(x_dims)}}); - runner_bc.Run(stream); - - // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) - phi::DenseTensor x_power_mul_factor(x->type()); - x_power_mul_factor.mutable_data(x->dims(), place); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); - runner_mul_1.Run(stream); - - // Step 4: Compute dx = dout * factor * x.pow(factor-1) - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class ReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class Relu6NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu6", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Relu6GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Relu6Grad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class SqrtNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LeakyReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto alpha = ctx.Attr("alpha"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("LeakyRelu", {*x}, {*out}, {{"negative_slope", alpha}}); - runner.Run(stream); - } -}; - -template -class 
LeakyReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto alpha = ctx.Attr("alpha"); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "LeakyReluGrad", {*dout, *x}, {*dx}, {{"negative_slope", alpha}}); - - runner.Run(stream); - } -}; - -template -class SqrtGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class LogNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor one(x->type()); - one.mutable_data(x->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); - runner_one.Run(stream); - - phi::DenseTensor sub(x->type()); - sub.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {}); - runner_out.Run(stream); - } -}; - -template -class LogGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class TanhNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class TanhGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class SquareNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto 
place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SquareGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto factor = static_cast(2.0); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - // Step 1: Compute x_muls_factor = factor * x - phi::DenseTensor x_muls_factor(x->type()); - x_muls_factor.mutable_data(x->dims(), place); - const auto& runner_muls_1 = - NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); - runner_muls_1.Run(stream); - - // Step 2: Compute dx = dout * factor * x - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_muls_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class SigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sigmoid", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("SigmoidGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -// Swish = x * sigmoid(beta * x) -template -class SwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float beta = ctx.Attr("beta"); - - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {*out}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = NpuOpRunner("Sigmoid", {*out}, {*out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner("Mul", {*x, *out}, {*out}); - mul_runner.Run(stream); - } -}; - -template -class SwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - float beta = ctx.Attr("beta"); - - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor beta_x, sigmoid_out, swish_out; - beta_x.mutable_data(x->dims(), ctx.GetPlace()); - sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); - swish_out.mutable_data(x->dims(), ctx.GetPlace()); - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = - 
NpuOpRunner("Sigmoid", {beta_x}, {sigmoid_out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = - NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {}); - mul_runner.Run(stream); - const auto& muls_runner2 = - NpuOpRunner("Muls", {swish_out}, {swish_out}, {{"value", beta}}); - muls_runner2.Run(stream); - - const auto& mul_runner1 = - NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {}); - mul_runner1.Run(stream); - - const auto& sub_runner = NpuOpRunner("Sub", {swish_out, *dx}, {*dx}, {}); - sub_runner.Run(stream); - - const auto& add_runner = NpuOpRunner("Add", {sigmoid_out, *dx}, {*dx}, {}); - add_runner.Run(stream); - - const auto& mul_runner2 = NpuOpRunner("Mul", {*dout, *dx}, {*dx}, {}); - mul_runner2.Run(stream); - } -}; - -// HardSwish = min(max(0, x+offset), threshold) * x / scale -template -class HardSwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); - - phi::DenseTensor tensor_zero(x->type()); - tensor_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); - - phi::DenseTensor clip_val(x->type()); - clip_val.mutable_data(x->dims(), place); - const auto& runner_clip = - NpuOpRunner("ClipByValue", - {add_offset_val, tensor_zero, tensor_threshold}, - {clip_val}); - runner_clip.Run(stream); - - phi::DenseTensor tensor_scale_tmp(x->type()); - tensor_scale_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); - phi::DenseTensor tensor_scale(x->type()); - tensor_scale.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_scale_tmp}, - {tensor_scale}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor div_val(x->type()); - div_val.mutable_data(x->dims(), place); - const auto& runner_div = - NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); - runner_div.Run(stream); - - const auto& runner_mul = NpuOpRunner("Mul", {*x, div_val}, {*out}); - runner_mul.Run(stream); - } -}; - -template -class HardSwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - 
FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tmp1(x->type()); - tmp1.mutable_data(x->dims(), place); - const auto& runner_pow1 = NpuOpRunner( - "Power", {*x}, {tmp1}, {{"scale", 2.0f}, {"shift", offset}}); - runner_pow1.Run(stream); - - phi::DenseTensor tmp2(x->type()); - tmp2.mutable_data(x->dims(), place); - const auto& runner_ht_grad = - NpuOpRunner("HardtanhGrad", - {add_offset_val, tmp1}, - {tmp2}, - {{"min_val", 0.0f}, {"max_val", threshold}}); - runner_ht_grad.Run(stream); - - phi::DenseTensor tmp3(x->type()); - tmp3.mutable_data(x->dims(), place); - const auto& runner_pow2 = NpuOpRunner( - "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); - runner_pow2.Run(stream); - - phi::DenseTensor tensor_threshold_tmp(x->type()); - tensor_threshold_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold_tmp, - static_cast(threshold)); - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_threshold_tmp}, - {tensor_threshold}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor tmp_bool(phi::DataType::BOOL); - tmp_bool.mutable_data(x->dims(), place); - const auto& runner_less = - NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); - runner_less.Run(stream); - phi::DenseTensor tmp4(x->type()); - tmp4.mutable_data(x->dims(), place); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast = - NpuOpRunner("Cast", - {tmp_bool}, - {tmp4}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - - phi::DenseTensor tmp5(x->type()); - tmp5.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); - runner_sub.Run(stream); - - const auto& runner_final = NpuOpRunner("Mul", {tmp5, *dout}, {*dx}); - runner_final.Run(stream); - } -}; - -template -class HardSigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("HardSigmoid", {*x}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class HardSigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - dx->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("HardSigmoidGrad", {*dout, *out}, {*dx}, attr_input); - runner_dx.Run(stream); - } -}; - -template -class ReciprocalNPUKernel : public framework::OpKernel { - 
public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Reciprocal", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ReciprocalGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = - NpuOpRunner("ReciprocalGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class CosNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Cos", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class CosGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - phi::DenseTensor sin_out(x->type()); // Temporary phi::DenseTensor - sin_out.Resize(x->dims()); - sin_out.mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Sin", {*x}, {sin_out}, {}); - runner.Run(stream); - - const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); - runner_dx.Run(stream); - - phi::DenseTensor tmp(x->type()); // Temporary phi::DenseTensor - tmp.Resize(phi::make_ddim({1, 1})); - tmp.mutable_data(place); - float factor = -1.; - FillNpuTensorWithConstant(&tmp, static_cast(factor)); - - const auto& runner_dx_ = NpuOpRunner("Xdivy", {*dx, tmp}, {*dx}, {}); - runner_dx_.Run(stream); - // dx = -dout * Sine(x); - } -}; - -template -class AtanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class AtanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class ExpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - 
out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Exp", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ExpGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Mul", {*dout, *out}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class SinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sin", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - pow, - ops::PowNPUKernel, - ops::PowNPUKernel); - -REGISTER_OP_NPU_KERNEL( - pow_grad, - ops::PowGradNPUKernel, - ops::PowGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu, - ops::ReluNPUKernel, - ops::ReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu_grad, - ops::ReluGradNPUKernel, - ops::ReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6, - ops::Relu6NPUKernel, - ops::Relu6NPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6_grad, - ops::Relu6GradNPUKernel, - ops::Relu6GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu, - ops::LeakyReluNPUKernel, - ops::LeakyReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu_grad, - ops::LeakyReluGradNPUKernel, - ops::LeakyReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt, - ops::SqrtNPUKernel, - ops::SqrtNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt_grad, - ops::SqrtGradNPUKernel, - ops::SqrtGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log, - ops::LogNPUKernel, - ops::LogNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log_grad, - ops::LogGradNPUKernel, - ops::LogGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh, - ops::TanhNPUKernel, - ops::TanhNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh_grad, - ops::TanhGradNPUKernel, - ops::TanhGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square, - ops::SquareNPUKernel, - ops::SquareNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square_grad, - ops::SquareGradNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid, - ops::SigmoidNPUKernel, - ops::SigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid_grad, - ops::SigmoidGradNPUKernel, - ops::SigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish, - ops::SwishNPUKernel, - ops::SwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish_grad, - ops::SwishGradNPUKernel, - ops::SwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish, - ops::HardSwishNPUKernel, - ops::HardSwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish_grad, - ops::HardSwishGradNPUKernel, - ops::HardSwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid, - ops::HardSigmoidNPUKernel, - ops::HardSigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid_grad, - ops::HardSigmoidGradNPUKernel, - ops::HardSigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - reciprocal, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel); - 
-REGISTER_OP_NPU_KERNEL( - reciprocal_grad, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos, - ops::CosNPUKernel, - ops::CosNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos_grad, - ops::CosGradNPUKernel, - ops::CosGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan, - ops::AtanNPUKernel, - ops::AtanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan_grad, - ops::AtanGradNPUKernel, - ops::AtanGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp, - ops::ExpNPUKernel, - ops::ExpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp_grad, - ops::ExpGradNPUKernel, - ops::ExpGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sin, - ops::SinNPUKernel, - ops::SinNPUKernel, - ops::SinNPUKernel); diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index cbedb02f86836..cbd9c8b2768b4 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,11 +4,3 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() - -if(WITH_ASCEND_CL) - cc_test( - check_finite_and_unscale_op_npu_test - SRCS check_finite_and_unscale_op_npu_test.cc - DEPS op_registry check_finite_and_unscale_op scope device_context enforce - executor) -endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc deleted file mode 100644 index 424c2326ab201..0000000000000 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AllocFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* float_status = ctx.Output("FloatStatus"); - float_status->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - alloc_float_status, - ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc deleted file mode 100644 index 63e16fb357058..0000000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. -// On NPU, we do not really check the data of input tensors, -// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, -// and clear it after this op. -// Which may leads to wrong result if the input tensors is not calculated -// on NPU device, but got from other way, for example, feeding. -template -class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - const auto* float_status = ctx.Input("FloatStatus"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - found_inf->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // step1: inverse scale - phi::DenseTensor const_tensor; - const_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); - - // Inverse(1.0/scale) - phi::DenseTensor* tmp_inverse_out = const_cast(scale); - phi::DenseTensor inverse_out(scale->type()); - inverse_out.Resize(scale->dims()); - inverse_out.mutable_data(ctx.GetPlace()); - const auto& runner_inverse = - NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); - runner_inverse.Run(stream); - tmp_inverse_out = &inverse_out; - - // NOTE(zhiqiu): - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. - // tmp is only placeholder. - const auto& runner_float_status = - NpuOpRunner("NPUGetFloatStatus", - {*float_status}, - {tmp}, - {{"message", std::string("check_nan_and_inf")}}); - runner_float_status.Run(stream); - - phi::DenseTensor sum; - sum.mutable_data({1}, ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {*float_status}, - {sum}, - {{"axes", std::vector{0}}, {"keep_dims", true}}); - runner_reduce_sum.Run(stream); - - const auto& runner_greater = - NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {}); - runner_greater.Run(stream); - - // NOTE(zhiqiu): The normal logic is : - // out = in, if found_inf = true - // out = in/scale, if found_inf = false - // However, on NPU, in order to avoid stream sync, we do not copy the - // found_inf data to cpu to check whether to unscale or not. - // Instead, we do the Mul no matter found_inf or not. - // And, a fact is, only few steps contains nan/inf during training. 
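// ---- [Editorial aside, not part of the deleted file] ----------------------
// A minimal host-side sketch of the equivalence the NOTE above describes;
// the names (x, out, scale, found_inf, n) are illustrative stand-ins for the
// device-side tensors, not identifiers from the kernel:
//
//   float inverse_scale = 1.0f / scale;      // the Div runner above
//   for (size_t i = 0; i < n; ++i)           // the Mul loop below
//     out[i] = x[i] * inverse_scale;         // applied unconditionally
//
// When found_inf is true the outputs are discarded by the caller anyway, so
// multiplying unconditionally trades a little wasted work for skipping a
// device-to-host sync on found_inf.
// ----------------------------------------------------------------------------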
- for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(ctx.GetPlace()); - const auto& runner_mul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_mul.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleNPUKernel, - ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc deleted file mode 100644 index bf7272ba8b878..0000000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(check_finite_and_unscale); -USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); - -struct InputVars { - std::string name; - phi::DenseTensor *tensor; -}; - -template -void Compare(f::Scope *scope, const p::DeviceContext &ctx) { - const f::DDim dims = phi::make_ddim({2, 2}); - auto place = ctx.GetPlace(); - - // init input - std::vector input_names = { - {"x", scope->Var("x")->GetMutable()}, - {"x1", scope->Var("x1")->GetMutable()}}; - - auto *scale = scope->Var("scale")->GetMutable(); - - // init output - auto *out = scope->Var("out")->GetMutable(); - auto *out1 = scope->Var("out1")->GetMutable(); - auto *found_inf = scope->Var("found_inf")->GetMutable(); - - // Initialize input data - const int num_inputs = input_names.size(); - size_t numel = static_cast(phi::product(dims)); - - for (int i = 0; i < num_inputs; ++i) { - std::vector init_xs; - for (size_t j = 0; j < numel; ++j) { - if (j == 0) { - init_xs.push_back(static_cast(NAN)); - } else { - init_xs.push_back(static_cast(j + 1)); - } - } - f::TensorFromVector(init_xs, ctx, input_names[i].tensor); - input_names[i].tensor->Resize(dims); - } - - f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - "check_finite_and_unscale", - {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, - {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - // out0 - std::vector out_vec; - f::TensorToVector(*out, ctx, &out_vec); - EXPECT_EQ(out_vec.size(), static_cast(4)); - for (size_t j = 0; j < out_vec.size(); ++j) { - VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; - } - - 
ctx.Wait(); - - // out0 - std::vector out1_vec; - f::TensorToVector(*out1, ctx, &out1_vec); - EXPECT_EQ(out1_vec.size(), static_cast(4)); - for (size_t j = 0; j < out1_vec.size(); ++j) { - VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; - } - - ctx.Wait(); - - // out found_inf - phi::DenseTensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool *found_inf_data = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_TRUE(*found_inf_data); - - ctx.Wait(); -} - -TEST(check_finite_and_unscale, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(check_finite_and_unscale, NPU_fp16) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc deleted file mode 100644 index 1f3e54421f020..0000000000000 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClearFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); - // NOTE(zhiqiu): NPUClearFloatStatus modifies the input. - PADDLE_ENFORCE_EQ(float_status_out, - float_status, - platform::errors::PreconditionNotMet( - "The input(FloatStatus) and Output(FloatStatusOut) " - "should be the same.")); - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - const auto& runner = - NpuOpRunner("NPUClearFloatStatus", {tmp}, {*float_status_out}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - clear_float_status, - ops::ClearFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc deleted file mode 100644 index 5d8f88cc85f26..0000000000000 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GetFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); - // GetClearFloatStatus modifies the input. - PADDLE_ENFORCE_EQ(float_status_out, - float_status, - platform::errors::PreconditionNotMet( - "The input(FloatStatus) and Output(FloatStatusOut) " - "should be the same.")); - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - // NPUGetFloatStatus updates data on input in-place. - // tmp is only placeholder. - NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}).Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - get_float_status, - ops::GetFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc deleted file mode 100644 index d4565c1780928..0000000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ /dev/null @@ -1,293 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" - -DECLARE_int32(min_loss_scaling); - -namespace paddle { -namespace operators { - -template -void Update(const platform::NPUDeviceContext& ctx, - const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) { - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - if (found_inf_vec[0]) { - // good_out_data = 0 - auto g = good_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), - 0, - good_out_tensor->numel() * sizeof(int), - stream); - // bad_out_data = bad_in_data + 1 - phi::DenseTensor factor_tensor(bad_out_tensor->dtype()); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - const auto& runner_p2 = NpuOpRunner( - "Add", {*bad_in_tensor, factor_tensor}, {*bad_out_tensor}, {}); - runner_p2.Run(stream); - - std::vector bad_out_data; - paddle::framework::TensorToVector(*bad_out_tensor, ctx, &bad_out_data); - if (bad_out_data[0] >= decr_every_n_nan_or_inf) { - const auto& runner_p3 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", decr_ratio}, - {"shift", static_cast(0)}}); - - runner_p3.Run(stream); - - std::vector new_loss_scaling; - paddle::framework::TensorToVector( - *updated_loss_scaling_tensor, ctx, &new_loss_scaling); - float min_value = 1.0; - if (FLAGS_min_loss_scaling > 1) { - min_value = static_cast(FLAGS_min_loss_scaling); - } - - if (new_loss_scaling[0] < min_value) { - // updated_loss_scaling_data = 1 - const auto& runner_p4 = - NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(min_value)}}); - - runner_p4.Run(stream); - } - - // bad_out_data = 0 - auto b = bad_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(b), - 0, - bad_out_tensor->numel() * sizeof(int), - stream); - } - } else { - // bad_out_data = 0 - auto b = bad_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(b), - 0, - bad_out_tensor->numel() * sizeof(int), - stream); - - // good_out_data = good_in_data + 1 - phi::DenseTensor factor_tensor(good_out_tensor->dtype()); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - const auto& runner_p2 = NpuOpRunner( - "Add", {*good_in_tensor, factor_tensor}, {*good_out_tensor}, {}); - runner_p2.Run(stream); - - std::vector good_out_data; - paddle::framework::TensorToVector(*good_out_tensor, ctx, &good_out_data); - - if (good_out_data[0] >= incr_every_n_steps) { - const auto& runner_p3 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", incr_ratio}, - {"shift", static_cast(0)}}); - runner_p3.Run(stream); - - std::vector new_loss_scaling; - paddle::framework::TensorToVector( - *updated_loss_scaling_tensor, ctx, &new_loss_scaling); - if (!std::isfinite(new_loss_scaling[0])) { - // updated_loss_scaling_data = 
pre_loss_scaling_data - const auto& runner_p4 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(1)}, - {"shift", static_cast(0)}}); - - runner_p4.Run(stream); - } - // good_out_data = 0 - auto g = good_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), - 0, - good_out_tensor->numel() * sizeof(int), - stream); - } - } -} - -template -class UpdateLossScalingFunctor { - public: - void operator()(const platform::NPUDeviceContext& dev_ctx, - const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) const { - Update(dev_ctx, - found_inf_vec, - pre_loss_scaling_tensor, - good_in_tensor, - bad_in_tensor, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_tensor, - good_out_tensor, - bad_out_tensor); - } -}; - -template -class LazyZerosNPU { - public: - void operator()(const platform::NPUDeviceContext& dev_ctx, - const std::vector found_inf_vec, - const std::vector& xs, - const std::vector& outs) const { - if (!xs.size()) { - return; - } - auto place = dev_ctx.GetPlace(); - auto stream = dev_ctx.stream(); - phi::DenseTensor* zero_tensor = nullptr; - void* zero_ptr = nullptr; - if (found_inf_vec[0]) { - int max_num = -1; - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - int num = out->numel(); - if (max_num < num) { - max_num = num; - zero_tensor = out; - } - } - - zero_tensor->mutable_data(place); - const auto& runner_zeros = - NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); - runner_zeros.Run(stream); - zero_tensor->check_memory_size(); - zero_ptr = zero_tensor->data(); - } - - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - auto* x = xs[i]; - auto dst_ptr = out->mutable_data(place); - if (!found_inf_vec[0]) { - framework::TensorCopy(*x, place, dev_ctx, out); - } else if (zero_ptr != dst_ptr) { - auto size = out->numel() * phi::SizeOf(out->dtype()); - memory::Copy(place, dst_ptr, place, zero_ptr, size, stream); - } - } - } -}; - -template -class UpdateLossScalingNPUKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - const auto xs = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - const auto* found_inf = ctx.Input("FoundInfinite"); - PADDLE_ENFORCE_EQ(found_inf->numel(), - 1, - platform::errors::InvalidArgument( - "FoundInfinite must has only one element.")); - - std::vector found_inf_vec; - paddle::framework::TensorToVector( - *found_inf, ctx.device_context(), &found_inf_vec); - - LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); - const bool stop_update = ctx.Attr("stop_update"); - if (stop_update) { - return; - } - - const auto* pre_loss_scaling = - ctx.Input("PrevLossScaling"); - const auto* good_in = ctx.Input("InGoodSteps"); - const auto* bad_in = ctx.Input("InBadSteps"); - auto* updated_loss_scaling = ctx.Output("LossScaling"); - auto* good_out = ctx.Output("OutGoodSteps"); - auto* bad_out = ctx.Output("OutBadSteps"); - - 
updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); - good_out->mutable_data(dev_ctx.GetPlace()); - bad_out->mutable_data(dev_ctx.GetPlace()); - - const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); - const int decr_every_n_nan_or_inf = - ctx.Attr("decr_every_n_nan_or_inf"); - const float incr_ratio = ctx.Attr("incr_ratio"); - const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}(dev_ctx, - found_inf_vec, - pre_loss_scaling, - good_in, - bad_in, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling, - good_out, - bad_out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - update_loss_scaling, - ops::UpdateLossScalingNPUKernel, - ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc deleted file mode 100644 index 014fb09474936..0000000000000 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -struct VisitDataArgNPUMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgNPUMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto dtype = ctx.Attr("dtype"); - const bool& flatten = ctx.Attr("flatten"); - - phi::DenseTensor transformed_x(x.type()); - transformed_x.ShareDataWith(x); - if (flatten) { - transformed_x.Resize(phi::make_ddim({x.numel()})); - } - - auto stream = ctx.template device_context().stream(); - NpuOpRunner runner; - runner.SetType("ArgMaxV2") - .AddInput(transformed_x) - .AddInput(std::vector{axis}) - .AddOutput(out) - .AddAttrDataType("dtype", dtype) - .Run(stream); - } -}; - -template -class ArgMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny(static_cast( - framework::proto::VarType::INT64), - VisitDataArgNPUMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgNPUMaxFunctor(ctx)); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(arg_max, - ops::ArgMaxNPUKernel, - ops::ArgMaxNPUKernel); diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc deleted file mode 100644 index e601efd2d37e1..0000000000000 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ArgMinNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    int64_t axis = ctx.Attr<int64_t>("axis");
-    auto dtype = ctx.Attr<int>("dtype");
-
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<int32_t>(ctx.GetPlace());
-
-    NpuOpRunner runner;
-    runner.SetType("ArgMin")
-        .AddInput(*x)
-        .AddInput(std::vector<int64_t>{axis})
-        .AddOutput(*out)
-        .AddAttr("dtype", dtype);
-
-    auto stream =
-        ctx.template device_context<platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(
-    arg_min,
-    ops::ArgMinNPUKernel<float>,
-    ops::ArgMinNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc
deleted file mode 100644
index 18915ee4f3d79..0000000000000
--- a/paddle/fluid/operators/argsort_op_npu.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void TranposeNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - std::vector* perm, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(in) - .AddInput(std::move(*perm)) - .AddOutput(*out) - .Run(stream); -} - -static void CastToInt64(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_INT64) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class ArgsortNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - auto stream = ctx.template device_context().stream(); - framework::NPUAttributeMap attr = {{"axis", -1}, - {"descending", descending}}; - - phi::DenseTensor indices_tmp(phi::DataType::INT32); - indices_tmp.Resize(indices->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor input_fp32(phi::DataType::FLOAT32); - input_fp32.Resize(input->dims()); - CastToFP32(ctx, stream, *input, &input_fp32); - - phi::DenseTensor output_fp32(phi::DataType::FLOAT32); - output_fp32.Resize(output->dims()); - - if (axis == -1 || axis + 1 == in_dims.size()) { - output_fp32.mutable_data(ctx.GetPlace()); - indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {input_fp32}, {output_fp32, indices_tmp}, attr); - runner.Run(stream); - - CastToInt64(ctx, stream, output_fp32, output); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input_fp32.type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, input_fp32, &trans_input); - - phi::DenseTensor trans_output(input_fp32.type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, &output_fp32); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - - CastToInt64(ctx, stream, output_fp32, output); - } - } else { - if (axis == -1 || axis + 1 == in_dims.size()) { - output->mutable_data(ctx.GetPlace()); - 
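Both the int64 and the generic branches of this kernel handle a non-trailing `axis` the same way: the NPU "Sort" op only sorts along the last dimension, so the data is transposed so that `axis` becomes last, sorted, and then transposed back with the same permutation. A self-contained sketch of that permutation trick (hypothetical helper name, not part of the original file):

#include <cstdint>
#include <utility>
#include <vector>

// Build the permutation that swaps `axis` with the last dimension of a
// rank-`rank` tensor. Applying the same permutation twice is the identity,
// which is why the kernel reuses `perm` for the inverse transpose.
std::vector<int64_t> MakeSwapToLastPerm(int64_t rank, int64_t axis) {
  std::vector<int64_t> perm(rank);
  for (int64_t i = 0; i < rank; ++i) perm[i] = i;  // identity
  std::swap(perm[axis], perm[rank - 1]);
  return perm;
}
// e.g. rank 4, axis 1 -> {0, 3, 2, 1}: transpose, sort the last axis,
// transpose again with the same perm to restore the original layout.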
indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); - runner.Run(stream); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input->type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, *input, &trans_input); - - phi::DenseTensor trans_output(input->type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, output); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - } - } - - CastToInt64(ctx, stream, indices_tmp, indices); - } -}; - -template -static void FullAssignNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const framework::DDim in_dims, - const phi::DenseTensor& input, - const phi::DenseTensor& indices, - phi::DenseTensor* t_out) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - phi::DenseTensor input_tmp; - input_tmp.ShareDataWith(input); - input_tmp.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - phi::DenseTensor indices_tmp; - indices_tmp.ShareDataWith(indices); - indices_tmp.Resize( - phi::make_ddim(std::vector{input_height, input_width})); - - std::vector indexs_value; - for (Type i = 0; i < input_height; i++) { - indexs_value.push_back(i * input_width); - } - phi::DenseTensor indexs_tmp(indices.type()); - framework::TensorFromVector( - indexs_value, ctx.device_context(), &indexs_tmp); - indexs_tmp.Resize(phi::make_ddim(std::vector{input_height, 1})); - - phi::DenseTensor indices_index(indices.type()); - indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); - const auto& runner_add = - NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); - runner_add.Run(stream); - - indices_index.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - t_out->mutable_data(ctx.GetPlace()); - phi::DenseTensor out_tmp(t_out->type()); - out_tmp.ShareDataWith(*t_out); - - const auto& runner = NpuOpRunner("TensorScatterUpdate", - {input_tmp, indices_index, input_tmp}, - {out_tmp}, - {}); - runner.Run(stream); -} - -template -class ArgsortGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - if (dO->numel() == 0) return; - - auto stream = ctx.template device_context().stream(); - - if (axis == -1 || axis + 1 == in_dims.size()) { - FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_dout(dO->type()); - phi::DenseTensor trans_ids(indices->type()); - trans_dout.Resize(trans_dims); - trans_ids.Resize(trans_dims); - - TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); - TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); - - phi::DenseTensor trans_dx(dO->type()); - trans_dx.Resize(trans_dims); - FullAssignNPU( - ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx); - - TranposeNPU(ctx, stream, &perm, trans_dx, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(argsort, - ops::ArgsortNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ArgsortNPUKernel, -#endif - ops::ArgsortNPUKernel); - -REGISTER_OP_NPU_KERNEL(argsort_grad, - ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc deleted file mode 100644 index ff88427c12336..0000000000000 --- a/paddle/fluid/operators/assign_op_npu.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/assign_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace framework { -class OpDesc; -class Variable; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { -template -class AssignNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - assign, - ops::AssignNPUKernel, - ops::AssignNPUKernel, - ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc deleted file mode 100644 index 25d8d07802ad1..0000000000000 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP_ITSELF(assign);
-USE_OP_DEVICE_KERNEL(assign, NPU);
-
-template <typename T>
-void Compare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             std::string op_type) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-
-  std::vector<T> init;
-  init.push_back(static_cast<T>(1.0));
-  init.push_back(static_cast<T>(2.0));
-  init.push_back(static_cast<T>(3.0));
-  init.push_back(static_cast<T>(4.0));
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({4});
-
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-
-  auto op =
-      f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
-
-  op->Run(*scope, place);
-
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-
-  ctx.Wait();
-
-  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
-  EXPECT_EQ(out_vec[0], static_cast<T>(1.0));
-  EXPECT_EQ(out_vec[1], static_cast<T>(2.0));
-  EXPECT_EQ(out_vec[2], static_cast<T>(3.0));
-  EXPECT_EQ(out_vec[3], static_cast<T>(4.0));
-}
-
-TEST(assign, NPU_fp32) {
-  f::Scope scope;
-  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
-  Compare<float>(&scope, *ctx, "assign");
-}
diff --git a/paddle/fluid/operators/assign_value_op_npu.cc b/paddle/fluid/operators/assign_value_op_npu.cc
deleted file mode 100644
index 5354f26d6fa73..0000000000000
--- a/paddle/fluid/operators/assign_value_op_npu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "paddle/fluid/operators/assign_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(assign_value, - ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel); diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc deleted file mode 100644 index 15774d5712fff..0000000000000 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ /dev/null @@ -1,261 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/batch_norm_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class NPUBatchNormOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - bool test_mode = is_test && (!trainable_stats); - bool training = !test_mode && !use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - (x_dims.size() == 4UL || x_dims.size() == 3UL), - true, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 3 or 4. 
" - " But got X's shape = [%s], X's dimension = [%d].", - x_dims.to_str(), - x_dims.size())); - - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto y_tesnor = - ctx.AllocateTmpTensor(y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - y_tesnor.ShareDataWith(*y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - y_tesnor.set_layout(DataLayout::kNHWC); - } - - auto stream = ctx.template device_context().stream(); - if (!training) { - const auto &runner_infer = - NpuOpRunner("BNInfer", - {x_tensor, *scale, *bias, *running_mean, *running_var}, - {y_tesnor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - phi::DenseTensor sum, square_sum; - sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - - // BNTrainingReduce ONLY support rank = 4 - if (x->dims().size() == 3) { - auto x_shape_vec = phi::vectorize(x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - x_tensor.Resize(x_new_shape); - x_tensor.Resize(x_new_shape); - } - const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", - {x_tensor}, - {sum, square_sum}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - - const auto &runner_update = NpuOpRunner( - "BNTrainingUpdate", - {x_tensor, - sum, - square_sum, - *scale, - *bias, - *running_mean, - *running_var}, - {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, - {{"factor", momentum}, {"epsilon", epsilon}}); - runner_update.Run(stream); - } - } -}; - -template -class NPUBatchNormGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = - ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - auto *d_x 
= ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto dy_tensor = - ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - dy_tensor.ShareDataWith(*d_y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - dy_tensor.set_layout(DataLayout::kNHWC); - } - - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); - auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); - if (d_scale == nullptr) { - d_scale = &scale_grad_tmp; - } - if (d_bias == nullptr) { - d_bias = &bias_grad_tmp; - } - - auto stream = ctx.template device_context().stream(); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *running_mean, *running_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } else { - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *saved_mean, *saved_inv_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } - } - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto dx_tensor = - ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); - dx_tensor.ShareDataWith(*d_x); - if (data_layout == DataLayout::kNHWC) { - dx_tensor.set_layout(DataLayout::kNHWC); - } - if (use_global_stats) { - if (x->dims().size() == 3) { - // BNInferGrad only support x rank = 4, - auto x_shape_vec = phi::vectorize(d_x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - x_shape_vec.insert(x_shape_vec.begin() + 2, - 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - dx_tensor.Resize(x_new_shape); - dy_tensor.Resize(x_new_shape); - } - const auto *running_var = ctx.Input("Variance"); - const auto &runner_infer = - NpuOpRunner("BNInferGrad", - {dy_tensor, *scale, *running_var}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - const auto &runner_reduce = NpuOpRunner("BNTrainingReduceGrad", - {dy_tensor, - x_tensor, - *d_scale, - *d_bias, - *scale, - *saved_mean, - *saved_inv_variance}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(batch_norm, - ops::NPUBatchNormOpKernel, - ops::NPUBatchNormOpKernel); -REGISTER_OP_NPU_KERNEL(batch_norm_grad, - ops::NPUBatchNormGradOpKernel, - ops::NPUBatchNormGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc deleted file mode 100644 index ed8872d90ef6f..0000000000000 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class BCELossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *labels}, - {*out}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -template -class BCELossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *labels, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - bce_loss, - ops::BCELossNPUKernel, - ops::BCELossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - bce_loss_grad, - ops::BCELossGradNPUKernel, - ops::BCELossGradNPUKernel); diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc deleted file mode 100644 index 147d1be226255..0000000000000 --- a/paddle/fluid/operators/beam_search_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" - -namespace ops = paddle::operators; -using NPUCtx = paddle::platform::NPUDeviceContext; - -REGISTER_OP_NPU_KERNEL(beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc deleted file mode 100644 index 411e112318d12..0000000000000 --- a/paddle/fluid/operators/cast_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -namespace paddle { -namespace operators { - -static std::map - DTYPE_2_ACL_DTYPE = { - {framework::proto::VarType::BOOL, ACL_BOOL}, - {framework::proto::VarType::INT16, ACL_INT16}, - {framework::proto::VarType::INT32, ACL_INT32}, - {framework::proto::VarType::INT64, ACL_INT64}, - {framework::proto::VarType::FP16, ACL_FLOAT16}, - {framework::proto::VarType::FP32, ACL_FLOAT}, - {framework::proto::VarType::FP64, ACL_DOUBLE}, -}; - -template -class CastNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - - if (framework::TransToProtoVarType(x->dtype()) == dtype) { - // NOTE(zhiqiu): NPU cast op may result in wrong value, so - // add special case here. - VLOG(4) << "cast to same dtype:" << dtype; - out->mutable_data(place, x->type()); - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - return; - } - - auto iter = DTYPE_2_ACL_DTYPE.find( - static_cast(dtype)); - int aclDtype = iter->second; - - if (dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*out}, {{"dst_type", static_cast(aclDtype)}}); - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - cast, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op_npu.cc b/paddle/fluid/operators/clip_by_norm_op_npu.cc deleted file mode 100644 index f22f58d1769ea..0000000000000 --- a/paddle/fluid/operators/clip_by_norm_op_npu.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/clip_by_norm_op.h" - -namespace paddle { -namespace operators { - -template -class NPUClipByNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - - if (!(in_var->IsType())) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor" - "type, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - auto place = context.GetPlace(); - auto& dev_ctx = - context.template device_context(); - auto stream = dev_ctx.stream(); - - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(place); - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - - phi::DenseTensor square_sum(input->type()); - square_sum.mutable_data(framework::DDim({1}), place); - const auto& x_dims = input->dims(); - std::vector axis; - for (int i = 0; i < x_dims.size(); ++i) { - axis.push_back(i); - } - const auto& square_sum_runner = - NpuOpRunner("SquareSumV1", - {*input}, - {square_sum}, - {{"axis", axis}, {"keep_dims", false}}); - square_sum_runner.Run(stream); - - phi::DenseTensor x_norm(input->type()); - x_norm.mutable_data(framework::DDim({1}), place); - const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {}); - x_norm_runner.Run(stream); - - phi::DenseTensor x_norm_t; - framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t); - auto x_norm_v = static_cast(*x_norm_t.data()); - if (x_norm_v <= max_norm) { - framework::TensorCopy(*input, place, dev_ctx, output); - } else { - auto epsilon = x_norm_v <= static_cast(1e-30) - ? static_cast(1e-6) - : static_cast(0); - float scaling = max_norm / (x_norm_v + epsilon); - const auto& muls_runner = - NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}}); - muls_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - clip_by_norm, - ops::NPUClipByNormKernel, - ops::NPUClipByNormKernel); diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc deleted file mode 100644 index 8977bd250e868..0000000000000 --- a/paddle/fluid/operators/clip_op_npu.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClipNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto min_tensor = - ctx.HasInput("Min") ? 
ctx.Input("Min") : nullptr; - auto max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - phi::DenseTensor min_tensor_temp(x->type()); - phi::DenseTensor max_tensor_temp(x->type()); - if (min_tensor == nullptr) { - auto min_value = static_cast(ctx.Attr("min")); - min_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&min_tensor_temp, min_value); - min_tensor = &min_tensor_temp; - } - - if (max_tensor == nullptr) { - auto max_value = static_cast(ctx.Attr("max")); - max_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&max_tensor_temp, max_value); - max_tensor = &max_tensor_temp; - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("ClipByValue", {*x, *min_tensor, *max_tensor}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ClipGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto* min_tensor = - ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto* max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - auto min_val = ctx.Attr("min"); - if (min_tensor) { - phi::DenseTensor min_data; - framework::TensorCopy( - *min_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &min_data); - ctx.template device_context().Wait(); - min_val = static_cast(min_data.data()[0]); - } - - auto max_val = ctx.Attr("max"); - if (max_tensor) { - phi::DenseTensor max_data; - framework::TensorCopy( - *max_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &max_data); - ctx.template device_context().Wait(); - max_val = static_cast(max_data.data()[0]); - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("HardtanhGrad", - {*x, *dout}, - {*dx}, - {{"min_val", min_val}, {"max_val", max_val}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - clip, - ops::ClipNPUKernel, - ops::ClipNPUKernel); - -REGISTER_OP_NPU_KERNEL( - clip_grad, - ops::ClipGradNPUKernel, - ops::ClipGradNPUKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index c20200f6be316..baee3d20daebd 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -84,73 +84,4 @@ if(WITH_ASCEND_CL) device_context enforce executor) - cc_test( - c_broadcast_op_npu_test - SRCS c_broadcast_op_npu_test.cc - DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_allreduce_sum_op_npu_test - SRCS c_allreduce_sum_op_npu_test.cc - DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_reducescatter_op_npu_test - SRCS c_reducescatter_op_npu_test.cc - DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_allgather_op_npu_test - SRCS c_allgather_op_npu_test.cc - DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_reduce_sum_op_npu_test - SRCS c_reduce_sum_op_npu_test.cc - DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_allreduce_max_op_npu_test - SRCS 
c_allreduce_max_op_npu_test.cc - DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - send_v2_op_npu_test - SRCS send_v2_op_npu_test.cc - DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - recv_v2_op_npu_test - SRCS recv_v2_op_npu_test.cc - DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - checknumeric - SRCS checknumeric_npu_test.cc - DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_sync_comm_stream_op_npu_test - SRCS c_sync_comm_stream_op_npu_test.cc - DEPS op_registry - c_broadcast_op - c_comm_init_hccl_op - c_sync_comm_stream_op - c_gen_hccl_id_op - gen_hccl_id_op_helper - ${COLLECTIVE_DEPS} - ascend_hccl - dynamic_loader - dynload_warpctc - scope - device_context - enforce - executor) - cc_test( - c_sync_calc_stream_op_npu_test - SRCS c_sync_calc_stream_op_npu_test.cc - DEPS op_registry - elementwise_add_op - c_sync_calc_stream_op - c_gen_hccl_id_op - gen_hccl_id_op_helper - ${COLLECTIVE_DEPS} - ascend_hccl - dynamic_loader - dynload_warpctc - scope - device_context - enforce - executor) endif() diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc deleted file mode 100644 index 296174656f7a1..0000000000000 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/collective/c_allgather_op.h" - -namespace paddle { -namespace operators { - -template -class CAllGatherOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_allgather, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc deleted file mode 100644 index ca4fd7377102d..0000000000000 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allgather_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_allgather); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_allgather, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - - int num1 = 1; - int num2 = 4; - - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["ring_id"] = 0; - attrs["nranks"] = 2; - - auto op = f::OpRegistry::CreateOp( - "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - EXPECT_EQ(out_vec.size(), init.size() * 2); - for (uint32_t i = 0; i < out_vec.size() / 2; i++) { - EXPECT_EQ(out_vec[i], 1.0); - } - for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 2.0); - } -} - -TEST(c_allgather, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLAllGatherOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc deleted file mode 100644 index e7fc35a24e930..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_max, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc deleted file mode 100644 index 65dcfaa711261..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allgather_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_allreduce_max); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - 
endpointList[rank_id == 0 ? 1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - - int num1 = 100; - int num2 = 100; - - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id * 3); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["ring_id"] = 0; - - auto op = f::OpRegistry::CreateOp( - "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 4.0); - } -} - -TEST(c_allreduce_max, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLAllReduceOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc deleted file mode 100644 index 04beca3765d45..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_min, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc deleted file mode 100644 index 21ae06f57c790..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_prod, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc deleted file mode 100644 index ecc7fc566f68b..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_sum, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc deleted file mode 100644 index cd1d66e0ea0ea..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1 -// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test -// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0 -// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_allreduce_sum); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - std::cout << preStr << ":" << std::endl << debugstring; - for (auto ele : data) { - std::cout << ele << " "; - } - std::cout << std::endl; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -template -void TestHCCLAllReduceOp(f::Scope* scope, - const p::DeviceContext& ctx, - int iter) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - int rank_id = atoi(getenv("RANK_ID")); - int num1 = 3; - int num2 = 128; - - std::vector init; - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(static_cast(1.0 + rank_id)); - } - init[0] = static_cast(std::numeric_limits::quiet_NaN()); - PrintDebugInfo("input data", init); - - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx_" + std::to_string(iter)); - attrs["ring_id"] = 0; - attrs["use_calc_stream"] = 1; - - auto op = f::OpRegistry::CreateOp( - "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - for (int i = 0; i < 1; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - float diff = static_cast(out_vec[0]) - 65504; - EXPECT_TRUE(diff < 0.1 && diff > -0.1); - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 1; i < 10; i++) { - EXPECT_EQ(out_vec[i], static_cast(3.0)); - } -} - -TEST(c_allreduce_sum, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - // only support one device, if more than one device, use first default - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - - TestHCCLAllReduceOp(&scope, ctx, 1); - // TestHCCLAllReduceOp(&scope, ctx, 0); -} diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc deleted file mode 100644 index 8642dfd6088fa..0000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_broadcast_op.h" - -namespace paddle { -namespace operators { - -template -class CBroadcastOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_broadcast, - ops::CBroadcastOpASCENDKernel, - ops::CBroadcastOpASCENDKernel, - ops::CBroadcastOpASCENDKernel, - ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc deleted file mode 100644 index fa6a7374de687..0000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_broadcast); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_broadcast, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - int num = 2; - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - - for (int64_t i = 0; i < num * num; ++i) { - init.push_back(1.0 + rank_id); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num, num}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - 
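// The pattern above is shared by all of these HCCL tests: inputs are staged
// with TensorFromVector and the output buffer is allocated up front, with a
// ctx.Wait() after each step, because the H2D/D2H copies are asynchronous on
// the NPU stream; reading OutData back without the wait would race with the
// collective kernel.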
ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["root"] = 0; - attrs["ring_id"] = 0; - - auto op = f::OpRegistry::CreateOp( - "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 1.0); - } -} - -TEST(c_broadcast, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLBroadcastOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc deleted file mode 100644 index ef23a8a87e733..0000000000000 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/collective/c_embedding_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-inline void FillNPU(Tensor *dst,
-                    T val,
-                    const framework::ExecutionContext &context) {
-  Tensor value(dst->type());
-  value.mutable_data<T>({1}, context.GetPlace());
-  FillNpuTensorWithConstant<T>(&value, static_cast<T>(val));
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-
-  const auto &runner = NpuOpRunner(
-      "FillD", {value}, {*dst}, {{"dims", phi::vectorize(dst->dims())}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void shard_index(const Tensor &table_t,
-                 const Tensor &ids_t,
-                 int64_t start_idx,
-                 const Tensor &id_t,
-                 const framework::ExecutionContext &context) {
-  const int height = table_t.dims()[0];
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  phi::DenseTensor id_t_d;
-  id_t_d.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&id_t_d, static_cast<T>(0.0), context);
-  id_t_d.Resize(ids_t.dims());
-
-  phi::DenseTensor id_t_u;
-  id_t_u.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&id_t_u, static_cast<T>(height - 1), context);
-  id_t_u.Resize(ids_t.dims());
-
-  phi::DenseTensor id_matched_d;
-  id_matched_d.mutable_data<bool>(ids_t.dims(), context.GetPlace());
-  phi::DenseTensor id_matched_u;
-  id_matched_u.mutable_data<bool>(ids_t.dims(), context.GetPlace());
-  phi::DenseTensor ignore_tensor;
-  ignore_tensor.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&ignore_tensor, static_cast<T>(height), context);
-  ignore_tensor.Resize(ids_t.dims());
-
-  NpuOpRunner sub_runner;
-#if (CANN_VERSION_CODE >= 503003)
-  Tensor factor_tensor(ids_t.type());
-  factor_tensor.mutable_data<T>({1}, context.GetPlace());
-  paddle::framework::TensorFromVector(
-      std::vector<T>{static_cast<T>(start_idx)},
-      context.device_context(),
-      &factor_tensor);
-  sub_runner.SetType("Sub")
-      .AddInput(ids_t)
-      .AddInput(factor_tensor)
-      .AddOutput(id_t);
-#else
-  sub_runner.SetType("Sub")
-      .AddInput(ids_t)
-      .AddInput(std::vector<T>{static_cast<T>(start_idx)})
-      .AddOutput(id_t);
-#endif
-  sub_runner.Run();
-
-  NpuOpRunner lessequal1_runner;
-  lessequal1_runner.SetType("LessEqual")
-      .AddInput(id_t)
-      .AddInput(id_t_u)
-      .AddOutput(id_matched_u);
-  lessequal1_runner.Run();
-
-  NpuOpRunner lessequal2_runner;
-  lessequal2_runner.SetType("LessEqual")
-      .AddInput(id_t_d)
-      .AddInput(id_t)
-      .AddOutput(id_matched_d);
-  lessequal2_runner.Run();
-
-  NpuOpRunner("Equal", {id_matched_u, id_matched_d}, {id_matched_d}, {})
-      .Run(stream);
-  NpuOpRunner("Select", {id_matched_d, id_t, ignore_tensor}, {id_t}, {})
-      .Run(stream);
-}
-
-template <typename TIds, typename T>
-void NPUGetIdsEmbedding(const framework::ExecutionContext &context) {
-  auto *table_t = context.Input<phi::DenseTensor>("W");
-  auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-  auto *output_t = context.Output<phi::DenseTensor>("Out");
-  const int64_t start_idx = context.Attr<int64_t>("start_index");
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-
-  phi::DenseTensor ids_t_local;
-  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
-  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
-
-  auto pad_shape =
-      phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
-  phi::DenseTensor table_t_pad;
-
-  size_t mem_size = table_t->numel() * phi::SizeOf(table_t->dtype());
-  size_t line_mem_size = table_t->dims()[1] * phi::SizeOf(table_t->dtype());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "NPU requires the second dim of the embedding "
-                        "table to be aligned to 64 bytes"));
-
-  VLOG(10) << "mem_size:" << mem_size << ",line_mem_size:" << line_mem_size
-           << ", pad_shape:" << pad_shape
-           << ", table_dims:" << table_t->dims();
-
-  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
-      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  platform::NPUMemcpyAsync(pad_data,
-                           table_t->data<T>(),
-                           mem_size,
-                           ACL_MEMCPY_DEVICE_TO_DEVICE,
-                           stream,
-                           mem_size);
-  platform::NPUMemsetAsync(
-      pad_data + mem_size, 0, line_mem_size, stream, line_mem_size);
-
-  output_t->mutable_data<T>(context.GetPlace());
-  NpuOpRunner runner;
-  runner.SetType("GatherV2")
-      .AddInput(table_t_pad)
-      .AddInput(ids_t_local)
-      .AddInput(std::vector<int32_t>{0})
-#if (CANN_VERSION_CODE >= 503003)
-      .AddAttrs({{"batch_dims", 0}})
-#endif
-      .AddOutput(*output_t);
-  runner.Run();
-}
-
-template <typename T>
-class CEmbeddingNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-
-    const auto &index_type = framework::TransToProtoVarType(ids_t->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      NPUGetIdsEmbedding<int32_t, T>(context);
-    } else {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "NPU c_embedding ids only support int32."));
-    }
-  }
-};
-
-template <typename TIds, typename T>
-void NPUUpdateEmbedding(const framework::ExecutionContext &context) {
-  // get inputs
-  const int64_t start_idx = context.Attr<int64_t>("start_index");
-  auto ids_t = context.Input<phi::DenseTensor>("Ids");
-  auto d_output_t =
-      context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-  auto table_t = context.Input<phi::DenseTensor>("W");
-  auto table_grad_t =
-      context.Output<phi::DenseTensor>(framework::GradVarName("W"));
-
-  VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t
-           << ", table_t:" << table_t << ", table_grad_t:" << table_grad_t;
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-
-  // convert ids_t to local valid ids
-  phi::DenseTensor ids_t_local;
-  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
-  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
-
-  // padding table_t -> table_t_pad
-  auto pad_shape =
-      phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
-  phi::DenseTensor table_t_pad;
-
-  // set table_t_pad to zero
-  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
-      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  size_t table_t_pad_mem_size =
-      table_t_pad.numel() *
-      framework::SizeOfType(
-          framework::TransToProtoVarType(table_t_pad.dtype()));
-  platform::NPUMemsetAsync(
-      pad_data, 0, table_t_pad_mem_size, stream, table_t_pad_mem_size);
-
-  // NOTE(zhiqiu): It seems that in cann 20.1 the first input and the output
-  // can be different tensors, but in cann 20.2+ the op works in place.
-  // Thus, the first input and the output must be the same tensor.
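// ScatterAdd accumulates the rows of d_output_t into table_t_pad at the
// positions in ids_t_local. Ids owned by other shards were remapped to the
// sentinel row (index == height) by shard_index, so their gradients land in
// the padding row and are discarded by the copy-back below; "use_locking"
// makes accumulation into a given row atomic when ids repeat.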
-  const auto &runner_scatter =
-      NpuOpRunner("ScatterAdd",
-                  {table_t_pad, ids_t_local, *d_output_t},
-                  {table_t_pad},
-                  {{"use_locking", true}});
-  runner_scatter.Run(stream);
-
-  // copy table_t_pad back to table_grad_t
-  T *dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
-  const size_t mem_size =
-      table_grad_t->numel() * phi::SizeOf(table_grad_t->dtype());
-
-  // check alignment
-  size_t line_mem_size =
-      table_grad_t->dims()[1] * phi::SizeOf(table_grad_t->dtype());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "NPU requires the second dim of the embedding "
-                        "table to be aligned to 64 bytes"));
-
-  platform::NPUMemcpyAsync(
-      dst, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream, mem_size);
-}
-
-template <typename T>
-class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-
-    const auto &index_type = framework::TransToProtoVarType(ids_t->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      NPUUpdateEmbedding<int32_t, T>(context);
-    } else {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32."));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_embedding,
-                       ops::CEmbeddingNPUKernel<float>,
-                       ops::CEmbeddingNPUKernel<double>,
-                       ops::CEmbeddingNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(c_embedding_grad,
-                       ops::CEmbeddingGradNPUKernel<float>,
-                       ops::CEmbeddingGradNPUKernel<double>,
-                       ops::CEmbeddingGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_identity_op_npu.cc b/paddle/fluid/operators/collective/c_identity_op_npu.cc
deleted file mode 100644
index b97743cf14d6f..0000000000000
--- a/paddle/fluid/operators/collective/c_identity_op_npu.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_identity_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(c_identity,
-                       ops::CIdentityOpKernel<float>,
-                       ops::CIdentityOpKernel<double>,
-                       ops::CIdentityOpKernel<int>,
-                       ops::CIdentityOpKernel<int64_t>,
-                       ops::CIdentityOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
deleted file mode 100644
index 50d52e0ad1ac8..0000000000000
--- a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_max, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc deleted file mode 100644 index b94da957e8f16..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_min, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc deleted file mode 100644 index 7515ffad25f3e..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_prod, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc deleted file mode 100644 index 6f056520df20d..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_sum, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc deleted file mode 100644 index 67831aee39b82..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_reduce_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_reduce_sum); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(3) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - int rank_id = atoi(getenv("RANK_ID")); - int num1 = 3; - int num2 = 128; - - std::vector init; - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); - } - PrintDebugInfo("input data", init); - - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx_" + std::to_string(iter)); - attrs["ring_id"] = 0; - int root_id = 0; - attrs["root_id"] = root_id; - - auto op = f::OpRegistry::CreateOp( - "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - if (rank_id == root_id) { - EXPECT_EQ(out_vec[i], 3.0); - } else { - EXPECT_EQ(out_vec[i], init[i]); - } - } -} - -TEST(c_reduce_sum, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - for (int i = 0; i < 2; i++) { - VLOG(2) << "iter num: " << i; - TestHCCLReduceOp(&scope, ctx, i); - } -} diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc deleted file mode 100644 index d6bfcd1635a34..0000000000000 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" - -namespace paddle { -namespace operators { - -template -class CReduceScatterOpAscendKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reducescatter, - ops::CReduceScatterOpAscendKernel, - ops::CReduceScatterOpAscendKernel, - ops::CReduceScatterOpAscendKernel, - ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc deleted file mode 100644 index 3adaa8f4c85e6..0000000000000 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allgather_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_reducescatter); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - std::vector init; - int num1 = 4; - int num2 = 1; - - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["ring_id"] = 0; - attrs["nranks"] = 2; - - auto op = f::OpRegistry::CreateOp( - "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - int iter_num = 10; - for (int i = 0; i < iter_num; i++) { - op->Run(*scope, place); - ctx.Wait(); - } - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - EXPECT_EQ(out_vec.size(), init.size() / 2); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 2.0); - } -} - -TEST(c_reducescatter, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLReduceScatterOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc deleted file mode 100644 index abd25fa9e8f61..0000000000000 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto y = scope->Var("Y"); - auto tensor_y = y->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - std::vector init_y; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_y.push_back(static_cast(2.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - paddle::framework::TensorFromVector(init_y, ctx, tensor_y); - tensor_y->Resize({10, 10}); - - f::AttributeMap attrs; - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // sync data - auto sync_op0 = f::OpRegistry::CreateOp( - "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - sync_op0->Run(*scope, place); - - // run - - auto op = f::OpRegistry::CreateOp("elementwise_add", - {{"X", {"X"}}, {"Y", {"Y"}}}, - {{"Out", {"Out"}}}, - attrs); - - op->Run(*scope, place); - - // sync op run - auto sync_op = f::OpRegistry::CreateOp( - "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - sync_op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - // sync op copy - auto sync_op2 = f::OpRegistry::CreateOp( - "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - sync_op2->Run(*scope, place); - - float expected = 3.0; - - EXPECT_EQ(out_vec.size(), init_x.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(expected)); - } -} - -TEST(c_sync_calc_stream, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc deleted file mode 100644 index daac829c32c5a..0000000000000 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_broadcast); -USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_broadcast, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - int num = 2; - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - std::cout << "rank_id:" << rank_id << std::endl; - for (int64_t i = 0; i < num * num; ++i) { - init.push_back(1.0 + rank_id); - std::cout << init[0]; - } - std::cout << std::endl; - - 
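// Unlike the standalone broadcast test in c_broadcast_op_npu_test.cc, this
// variant deliberately omits ctx.Wait() after launching c_broadcast; it
// relies on the c_sync_comm_stream op run below to order the HCCL
// communication stream before TensorToVector reads OutData back.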
paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num, num}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["root"] = 0; - attrs["ring_id"] = 0; - - auto op = f::OpRegistry::CreateOp( - "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - op->Run(*scope, place); - - // comm sync - - auto sync_op = f::OpRegistry::CreateOp( - "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - sync_op->Run(*scope, place); - - // ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 1.0); - } -} - -TEST(c_sync_comm_stream_op, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLBroadcastOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc deleted file mode 100644 index 61d51f2857788..0000000000000 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <cmath>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_allreduce_sum);
-USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
-DECLARE_string(selected_npus);
-
-template <typename T>
-bool Check(T value, int size = 2 * 512 * 8192) {
-  f::Scope scope;
-  auto x = scope.Var("in");
-  auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
-      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
-  auto place = ctx.GetPlace();
-
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  tensor_x->Resize({size});
-  tensor_x->mutable_data<T>(place);  // allocate
-
-  std::vector<T> init;
-  for (int64_t i = 0; i < size; ++i) {
-    init.push_back(static_cast<T>(value));
-  }
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
-  return result;
-}
-
-TEST(check_numeric, NPU) {
-  auto inf = std::numeric_limits<float>::infinity();
-  auto fp16_inf = static_cast<p::float16>(inf);
-  auto nan = NAN;
-  auto fp16_nan = static_cast<p::float16>(nan);
-
-  bool result = false;
-  // Normal
-  VLOG(0) << "start normal";
-  result = Check<p::float16>(static_cast<p::float16>(65546));
-  ASSERT_FALSE(result);
-  Check<float>(static_cast<float>(1.0));
-  ASSERT_FALSE(result);
-
-  // Inf
-  VLOG(0) << "start inf";
-  result = Check<p::float16>(fp16_inf);
-  ASSERT_FALSE(result);
-  result = Check<float>(inf);
-  ASSERT_FALSE(result);
-
-  // Nan
-  VLOG(0) << "start nan";
-  result = Check<p::float16>(fp16_nan);
-  ASSERT_TRUE(result);
-  result = Check<float>(nan);
-  ASSERT_TRUE(result);
-}
diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
deleted file mode 100644
index 0054cfa468746..0000000000000
--- a/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
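The registration that follows reuses the generic ASCEND allreduce kernel
declared in c_allreduce_op.h, tagged with the sum reduce type, exactly as
c_allreduce_sum does; mp_allreduce_sum is only a model-parallel alias. A
minimal sketch of the kernel shape being instantiated (an assumption based on
the stub kernels visible elsewhere in this diff, not the verbatim header):

// Sketch only, not the verbatim contents of c_allreduce_op.h.
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Without Ascend support compiled in, the kernel can only fail loudly,
    // matching the c_broadcast and c_reducescatter stubs above.
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};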
- -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - mp_allreduce_sum, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc deleted file mode 100644 index 28a4266dcc989..0000000000000 --- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/collective/partial_allgather_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -class CallPartialGatherOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(partial_allgather, - ops::CallPartialGatherOpASCENDKernel, - ops::CallPartialGatherOpASCENDKernel, - ops::CallPartialGatherOpASCENDKernel, - ops::CallPartialGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/partial_recv_op_npu.cc b/paddle/fluid/operators/collective/partial_recv_op_npu.cc deleted file mode 100644 index a5c53a7900a20..0000000000000 --- a/paddle/fluid/operators/collective/partial_recv_op_npu.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/partial_recv_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -class PartialRecvOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(partial_recv, - ops::PartialRecvOpASCENDKernel, - ops::PartialRecvOpASCENDKernel, - ops::PartialRecvOpASCENDKernel, - ops::PartialRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/partial_send_op_npu.cc b/paddle/fluid/operators/collective/partial_send_op_npu.cc deleted file mode 100644 index 47343148d8ae9..0000000000000 --- a/paddle/fluid/operators/collective/partial_send_op_npu.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/send_v2_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -class PartialSendOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(partial_send, - ops::PartialSendOpASCENDKernel, - ops::PartialSendOpASCENDKernel, - ops::PartialSendOpASCENDKernel, - ops::PartialSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc deleted file mode 100644 index 6ea6c12efe319..0000000000000 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
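Each stub above was registered four times, but extraction dropped the template arguments from every REGISTER_OP_NPU_KERNEL list. A hedged reconstruction of one such list, with the four dtypes an explicit assumption based on typical fluid collective registrations:

// Hedged reconstruction of a stripped registration list; the exact dtype
// set is an assumption (float/float16/int variants were typical).
REGISTER_OP_NPU_KERNEL(partial_send,
                       ops::PartialSendOpASCENDKernel<float>,
                       ops::PartialSendOpASCENDKernel<int>,
                       ops::PartialSendOpASCENDKernel<int8_t>,
                       ops::PartialSendOpASCENDKernel<plat::float16>);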
*/ - -#include "paddle/fluid/operators/collective/recv_v2_op.h" - -#include "paddle/fluid/distributed/collective/process_group.h" -#include "paddle/phi/api/include/tensor.h" - -namespace paddle { -namespace operators { - -template -class CRecvOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(recv_v2, - ops::CRecvOpASCENDKernel, - ops::CRecvOpASCENDKernel, - ops::CRecvOpASCENDKernel, - ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc deleted file mode 100644 index ba298342a123e..0000000000000 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/operators/collective/recv_v2_op.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(recv_v2); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(recv_v2, NPU); - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - - int num = atoi(getenv("DATA_SIZE")); - EXPECT_GT(num, 0); - EXPECT_LT(num, 1 << 15); - int rank_id = atoi(getenv("RANK_ID")); - VLOG(3) << "rank_id:" << rank_id << std::endl; - - ctx.Wait(); - auto place = ctx.GetPlace(); - auto out = scope->Var("Data"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - f::AttributeMap attrs; - attrs["tag"] = std::string("srtest"); - attrs["peer"] = atoi(getenv("SRC_RANK")); - attrs["ring_id"] = 0; - attrs["srTag"] = 0; - std::vector out_shape; - out_shape.push_back(num); - out_shape.push_back(num); - attrs["out_shape"] = out_shape; - - auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); - VLOG(3) << "CreateOp recv_v2"; - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - VLOG(3) << "Run op recv_v2"; - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); - EXPECT_EQ(out_vec == init, true); -} - -TEST(recv_v2, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - char* npu_id = getenv("FLAGS_selected_npus"); - VLOG(3) << "Select npu:" << npu_id; - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHcomRecvOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc deleted file mode 100644 index 9500f4c211a9b..0000000000000 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/send_v2_op.h" - -#include "paddle/fluid/distributed/collective/process_group.h" -#include "paddle/phi/api/include/tensor.h" - -namespace paddle { -namespace operators { - -template -class CSendOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(send_v2, - ops::CSendOpASCENDKernel, - ops::CSendOpASCENDKernel, - ops::CSendOpASCENDKernel, - ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc deleted file mode 100644 index bb39fd5110546..0000000000000 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/operators/collective/send_v2_op.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(send_v2); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(send_v2, NPU); - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - int num = atoi(getenv("DATA_SIZE")); - - EXPECT_GT(num, 0); - EXPECT_LT(num, 1 << 15); - std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); - int rank_id = atoi(getenv("RANK_ID")); - VLOG(3) << "rank id:" << rank_id; - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num, num}); - ctx.Wait(); - auto place = ctx.GetPlace(); - ctx.Wait(); - - f::AttributeMap attrs; - attrs["tag"] = std::string("srtest"); - attrs["peer"] = atoi(getenv("DEST_RANK")); - attrs["ring_id"] = 0; - attrs["srTag"] = 0; - - auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - VLOG(3) << "send run over"; - ctx.Wait(); -} - -TEST(send_v2, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - char* npu_id = getenv("FLAGS_selected_npus"); - VLOG(3) << "Select npu:" << npu_id; - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHcomSendOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc deleted file mode 100644 index 491d44efa7261..0000000000000 --- a/paddle/fluid/operators/concat_op_npu.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
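Both deleted tests read their topology straight from the environment with atoi(getenv(...)), which dereferences a null pointer whenever a variable is unset, and their "RANK_TABLE_FILE" log line actually prints DEVICE_ID again. A defensive helper of the kind these tests could have used (illustrative, not part of the deleted code):

#include <cstdlib>

// Safe environment lookup: getenv() returns nullptr for unset variables,
// so guard before converting instead of calling atoi(getenv(name)) directly.
static int GetEnvInt(const char* name, int default_value) {
  const char* value = std::getenv(name);
  return value != nullptr ? std::atoi(value) : default_value;
}

// Usage: int rank_id = GetEnvInt("RANK_ID", 0);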
*/ - -#include "paddle/fluid/operators/concat_op.h" - -namespace paddle { -namespace operators { - -template -class ConcatNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - phi::DenseTensor* out = ctx.Output("Out"); - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initalized.")); - auto axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - PADDLE_THROW(platform::errors::NotFound( - "The AxisTensor is not supported on NPU now.")); - } - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - std::vector inputs; - std::vector names; - for (size_t i = 0; i < ins.size(); ++i) { - if (ins[i] && ins[i]->numel() > 0) { - inputs.push_back(*ins[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; - } - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*out}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } -}; - -template -class ConcatGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - auto outs = ctx.MultiOutput(framework::GradVarName("X")); - - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initalized.")); - - auto axis = ctx.Attr("axis"); - - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - int offset = 0; - auto stream = - ctx.template device_context() - .stream(); - for (size_t j = 0; j < outs.size(); ++j) { - // For stop gradient - // get output tensor that the name is not kEmptyVarName - if (out_var_names[j] != framework::kEmptyVarName && - outs[j]->numel() != 0UL) { - outs[j]->mutable_data(ctx.GetPlace()); - std::vector offsets; - std::vector sizes; - for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { - if (dim == axis) { - offsets.push_back(offset); - sizes.push_back(ins[j]->dims()[dim]); - } else { - offsets.push_back(0); - sizes.push_back(ins[j]->dims()[dim]); - } - } - const auto& runner = - NpuOpRunner("SliceD", - {*out_grad}, - {*outs[j]}, - {{"offsets", offsets}, {"size", sizes}}); - runner.Run(stream); - } - if (ins[j]->numel() != 0UL) { - offset += ins[j]->dims()[axis]; - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(concat, - ops::ConcatNPUKernel, - ops::ConcatNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatNPUKernel, -#endif - ops::ConcatNPUKernel); - -REGISTER_OP_NPU_KERNEL(concat_grad, - ops::ConcatGradNPUKernel, - ops::ConcatGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatGradNPUKernel, -#endif - ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc deleted file mode 100644 index ae6fd8a6fb222..0000000000000 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" - -namespace paddle { -namespace operators { - -template -class EqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class NotEqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("NotEqual", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class LessThanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Less", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class LessEqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("LessEqual", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GreaterThanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Greater", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GreaterEqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("GreaterEqual", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - 
equal, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel); - -REGISTER_OP_NPU_KERNEL( - not_equal, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel); - -REGISTER_OP_NPU_KERNEL( - less_than, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - less_equal, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel); - -REGISTER_OP_NPU_KERNEL( - greater_than, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - greater_equal, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc deleted file mode 100644 index de29f3689cd84..0000000000000 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
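Extraction stripped every angle-bracketed token from the comparison kernels above: template parameter lists, the Input/Output tensor types, and the dtypes in the registration lists. The pattern all six kernels (equal through greater_equal) followed, reconstructed under the usual fluid conventions (the exact parameter list and types are assumptions):

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

// Reconstructed shape of the deleted comparison kernels; template
// parameters and tensor types are assumed from fluid conventions.
template <typename DeviceContext, typename T>
class EqualNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());  // comparisons produce bool

    const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle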
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class LogicalNotNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LogicalOrNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("LogicalOr", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LogicalAndPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("LogicalAnd", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(logical_not, - ops::LogicalNotNPUKernel); - -REGISTER_OP_NPU_KERNEL(logical_or, - ops::LogicalOrNPUKernel); - -REGISTER_OP_NPU_KERNEL(logical_and, - ops::LogicalAndPUKernel); diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc deleted file mode 100644 index 44fb1aa5a1759..0000000000000 --- a/paddle/fluid/operators/conv_op_npu.cc +++ /dev/null @@ -1,688 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
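The three logical kernels above differ only in the ACL operator name they pass to NpuOpRunner (note that the last class is misspelled LogicalAndPUKernel rather than LogicalAndNPUKernel). A single helper would have covered both binary cases; a sketch using only the NpuOpRunner API visible in the deleted code:

#include <string>

// Shared body of the deleted logical_or / logical_and kernels; only the
// ACL op name ("LogicalOr" vs. "LogicalAnd") differs between them.
template <typename T>
void RunBinaryLogicalOp(const std::string& acl_op_name,
                        const paddle::framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<phi::DenseTensor>("X");
  auto* y = ctx.Input<phi::DenseTensor>("Y");
  auto* out = ctx.Output<phi::DenseTensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  auto stream =
      ctx.template device_context<paddle::platform::NPUDeviceContext>()
          .stream();
  const auto& runner = NpuOpRunner(acl_op_name, {*x, *y}, {*out}, {});
  runner.Run(stream);
}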
- -#include "paddle/fluid/operators/conv_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; -static void CastToFP16(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT16) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class DepthwiseConvNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1], - platform::errors::InvalidArgument( - "ShapeError: The output channels must be equal to the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1], - input->dims()[1], - platform::errors::InvalidArgument( - "ShapeError: The output channels must be equal to the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], - input->dims()[1])); - } - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - auto stream = ctx.template device_context().stream(); - - // Transform filter (n, 1, h, w) --> (1, n, h, w) - phi::DenseTensor transformed_filter(filter->type()); - transformed_filter.mutable_data({filter->dims()[1], - filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - const auto& runner = NpuOpRunner("DepthwiseConv2D", - {input_tensor, transformed_filter}, - {output_tensor}, - {{"strides", strides}, - {"dilations", dilations}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class DepthwiseConvGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - auto stream = ctx.template device_context().stream(); - - // Transform filter (n, 1, h, w) --> (1, n, h, w) - phi::DenseTensor transformed_filter(filter->type()); - transformed_filter.mutable_data({filter->dims()[1], - 
filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_EQ( - (dilations[2] == 1 && dilations[3] == 1), - true, - platform::errors::InvalidArgument( - "dilation_h and dilation_w in DepthwiseConv2DBackpropFilterD " - "must be equal to 1, but got dilation_h %d, dilation_w %d", - dilation[2], - dilation[3])); - - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropFilterD") - .AddInput(input_tensor) - .AddInput(output_grad_tensor) - .AddOutput(*filter_grad) - .AddAttr("filter_size", phi::vectorize(transformed_filter.dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropInputD") - .AddInput(transformed_filter) - .AddInput(output_grad_tensor) - .AddOutput(input_grad_tensor) - .AddAttr("input_size", phi::vectorize(input->dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - } -}; - -template -class NPUConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor 
input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2D", - {input_tensor, *filter}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_fp32(phi::DataType::FLOAT32); - filter_grad_fp32.Resize(filter_grad->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); - } else { - filter_grad_fp32.ShareDataWith(*filter_grad); - } - - const auto& runner = NpuOpRunner("Conv2DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_fp32}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", 
dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); - } - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2DBackpropInputD", - {*filter, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class NPUConv3dKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_tensor = - ctx.AllocateTmpTensor(output->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_tensor.ShareDataWith(*output); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv3D", - {input_tensor, filter_tensor}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConv3dGradKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_grad_tensor = ctx.AllocateTmpTensor( - output_grad->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_grad_tensor.ShareDataWith(*output_grad); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_grad_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_tensor = - ctx.AllocateTmpTensor(filter_grad->dims(), - dev_ctx); - filter_grad_tensor.ShareDataWith(*filter_grad); - filter_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = NpuOpRunner("Conv3DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_tensor}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor = - ctx.AllocateTmpTensor(input_grad->dims(), - dev_ctx); - input_grad_tensor.ShareDataWith(*input_grad); - input_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = 
NpuOpRunner("Conv3DBackpropInputD", - {filter_tensor, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d, - ops::DepthwiseConvNPUKernel, - ops::DepthwiseConvNPUKernel); - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d_grad, - ops::DepthwiseConvGradNPUKernel, - ops::DepthwiseConvGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d, - ops::NPUConvOpKernel, - ops::NPUConvOpKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_grad, - ops::NPUConvGradOpKernel, - ops::NPUConvGradOpKernel); - -REGISTER_OP_NPU_KERNEL(conv3d, - ops::NPUConv3dKernel, - ops::NPUConv3dKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_grad, - ops::NPUConv3dGradKernel, - ops::NPUConv3dGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc deleted file mode 100644 index f9da50848df2a..0000000000000 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/cpu/conv_util.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class Conv2DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - for (auto i = output_padding.size(); i < 4; ++i) { - output_padding.insert(output_padding.begin(), 0); - } - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2DTransposeD", - {input_tensor, *filter}, - {output_tensor}, - {{"input_size", output_dim_vec}, - {"strides", strides}, - {"dilations", dilations}, - {"output_padding", output_padding}, - {"groups", groups}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class Conv2DTransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const int groups = ctx.Attr("groups"); - std::string 
padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const phi::DataLayout data_layout = phi::StringToDataLayout(data_format); - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - // auto out_grad_dims = output_grad->dims(); - // const int batch_size = static_cast(input->dims()[0]); - - const bool channel_last = (data_layout == phi::DataLayout::kNHWC); - - framework::DDim in_data_dims; - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Conv2DBackpropFilterD", - {output_grad_tensor, input_tensor}, - {*filter_grad}, - {{"filter_size", phi::vectorize(filter_dims)}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2D", - {output_grad_tensor, *filter}, - {input_grad_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class Conv3DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - if (data_format == "NHWC") { - data_format = "NDHWC"; - } else { - data_format = "NCDHW"; - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - 
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(5, 1); - std::vector dilations(5, 1); - - phi::DenseTensor input_tensor, output_tensor, filter_tensor; - input_tensor.Resize(input->dims()); - input_tensor.ShareDataWith(*input); - output_tensor.Resize(output->dims()); - output_tensor.ShareDataWith(*output); - filter_tensor.Resize(filter->dims()); - filter_tensor.ShareDataWith(*filter); - - PADDLE_ENFORCE_EQ( - dilation[0], - 1, - platform::errors::InvalidArgument( - "dilation[0] must be equal 1, but received %d.", dilation[0])); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNDHWC); - output_tensor.set_layout(DataLayout::kNDHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - strides[3] = stride[2]; - dilations[2] = dilation[1]; - dilations[3] = dilation[2]; - } else { - input_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - strides[2] = stride[0]; - strides[3] = stride[1]; - strides[4] = stride[2]; - dilations[3] = dilation[1]; - dilations[4] = dilation[2]; - } - filter_tensor.set_layout(DataLayout::kNCDHW); - - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto& dev_ctx = ctx.template device_context(); - - NpuOpRunner runner; - runner.SetType("Conv3DBackpropInputD") - .AddInput(filter_tensor) - .AddInput(input_tensor) - .AddAttr("input_size", output_dim_vec) - .AddAttr("strides", strides) - .AddAttr("pads", padding) - .AddAttr("dilations", dilations) - .AddAttr("groups", groups) - .AddAttr("data_format", data_format) - .AddOutput(output_tensor); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(conv2d_transpose, - ops::Conv2DTransposeNPUKernel, - ops::Conv2DTransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad, - ops::Conv2DTransposeGradNPUKernel, - ops::Conv2DTransposeGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_transpose, - ops::Conv3DTransposeNPUKernel, - ops::Conv3DTransposeNPUKernel); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc deleted file mode 100644 index 5aaa832ce3383..0000000000000 --- a/paddle/fluid/operators/crop_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
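The transpose kernels above lean on a standard identity: the forward pass of a transposed convolution is the input-gradient of an ordinary convolution, and vice versa. That is why Conv3DTransposeNPUKernel::Compute runs Conv3DBackpropInputD, and why the input-gradient branch of Conv2DTransposeGradNPUKernel runs a plain Conv2D. The forward call, as it appears in the deleted code:

// Transposed convolution computed as the input-gradient of a forward
// convolution: the input tensor plays the role of the output gradient,
// and "input_size" names the shape of the transposed-conv output.
NpuOpRunner runner;
runner.SetType("Conv3DBackpropInputD")
    .AddInput(filter_tensor)                // the weights
    .AddInput(input_tensor)                 // stands in for dOut
    .AddAttr("input_size", output_dim_vec)  // desired output shape
    .AddAttr("strides", strides)
    .AddAttr("pads", padding)
    .AddAttr("dilations", dilations)
    .AddAttr("groups", groups)
    .AddAttr("data_format", data_format)
    .AddOutput(output_tensor);
runner.Run(dev_ctx.stream());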
*/
-
-#include "paddle/fluid/operators/crop_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CropNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-
-    std::vector<int> offset_list;
-    if (ctx.HasInput("Offsets")) {
-      auto* offsets_tensor = ctx.Input<phi::DenseTensor>("Offsets");
-      paddle::framework::TensorToVector(
-          *offsets_tensor, ctx.device_context(), &offset_list);
-      if (offset_list.empty()) {
-        offset_list.resize(x->dims().size(), 0);
-      }
-    } else {
-      auto res = ctx.Attr<std::vector<int>>("offsets");
-      if (res.empty()) {
-        offset_list.resize(x->dims().size(), 0);
-      } else {
-        offset_list.insert(offset_list.end(), res.begin(), res.end());
-      }
-    }
-
-    PADDLE_ENFORCE_EQ(
-        static_cast<int>(offset_list.size()),
-        x->dims().size(),
-        platform::errors::InvalidArgument(
-            "The shape (%d) of CropOp's "
-            "'offset' attribute should be equal to the shape of dims "
-            "(%d) of the Input(X).",
-            offset_list.size(),
-            x->dims().size()));
-
-    int axis_int = 0;
-    framework::NPUAttributeMap attr_input = {{"offsets", offset_list},
-                                             {"axis", axis_int}};
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    if (ctx.HasInput("Y")) {
-      auto* shape = ctx.Input<phi::DenseTensor>("Y");
-      PADDLE_ENFORCE_EQ(shape->dims().size(),
-                        x->dims().size(),
-                        platform::errors::InvalidArgument(
-                            "The shape of dims of (%d) of CropOp's "
-                            "Input(shape) should be equal to the shape of dims "
-                            "(%d) of the Input(X).",
-                            shape->dims().size(),
-                            x->dims().size()));
-
-      // shape memory maybe have gc.
-      phi::DenseTensor tmp_shape(*shape);
-      tmp_shape.mutable_data<T>(ctx.GetPlace());
-
-      const auto& runner =
-          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    } else {
-      auto shape_size = ctx.Attr<std::vector<int>>("shape");
-      PADDLE_ENFORCE_EQ(shape_size.size(),
-                        x->dims().size(),
-                        platform::errors::InvalidArgument(
-                            "The shape of dims of (%d) of CropOp's "
-                            "Input(shape) should be equal to the shape of dims "
-                            "(%d) of the Input(X).",
-                            shape_size.size(),
-                            x->dims().size()));
-      phi::DenseTensor tmp_shape(x->dtype());
-      tmp_shape.Resize(phi::make_ddim(shape_size));
-      tmp_shape.mutable_data<T>(ctx.GetPlace());
-      const auto& runner =
-          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    crop,
-    ops::CropNPUKernel<float>,
-    ops::CropNPUKernel<int>,
-    ops::CropNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc
deleted file mode 100644
index a5c77922054da..0000000000000
--- a/paddle/fluid/operators/cumsum_op_npu.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -static void CumsumImp(const phi::DenseTensor& input, - phi::DenseTensor* output, - const framework::NPUAttributeMap& attr_input, - const framework::ExecutionContext& ctx) { - auto stream = - ctx.template device_context() - .stream(); - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor tmp_input; - tmp_input.mutable_data(input.dims(), ctx.GetPlace()); - auto dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type())); - const auto& cast_runner_1 = - NpuOpRunner("Cast", - {input}, - {tmp_input}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_1.Run(stream); - - phi::DenseTensor tmp_output; - tmp_output.mutable_data(output->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); - runner.Run(stream); - - dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(output->type())); - const auto& cast_runner_2 = - NpuOpRunner("Cast", - {tmp_output}, - {*output}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_2.Run(stream); - } else { - const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); - runner.Run(stream); - } -} - -template -class CumSumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - bool exclusive = ctx.Attr("exclusive"); - bool reverse = ctx.Attr("reverse"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = { - {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - - bool flatten = ctx.Attr("flatten"); - if (flatten) { - PADDLE_ENFORCE_EQ( - axis, - -1, - platform::errors::InvalidArgument( - "when flatten is true, attr axis must be default %d, but got %d", - -1, - axis)); - - phi::DenseTensor new_x(x->type()); - new_x.ShareDataWith(*x); - - new_x.Resize(phi::make_ddim({x->numel()})); - - CumsumImp(new_x, out, attr_input, ctx); - } else { - CumsumImp(*x, out, attr_input, ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - cumsum, - ops::CumSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::CumSumNPUKernel, -#endif - ops::CumSumNPUKernel, - ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc deleted file mode 100644 index 4170088faff18..0000000000000 --- a/paddle/fluid/operators/detection/box_coder_op_npu.cc +++ /dev/null @@ -1,448 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/impl/box_coder.h" - -namespace paddle { -namespace operators { - -template -struct BoxCoderFunction { - public: - explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); - runner.Run(stream); - return y; - } - phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); - runner.Run(stream); - return y; - } - phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { - phi::DenseTensor z; - z.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); - runner.Run(stream); - return z; - } - phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - z.mutable_data(shape, place); - const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); - runner.Run(stream); - return z; - } - void DivWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - DivWithBroadCastVoid(x, y, shape, &z); - return z; - } - void MulWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - MulWithBroadCastVoid(x, y, shape, &z); - return z; - } - void AddWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - AddWithBroadCastVoid(x, y, shape, &z); - return z; - } - phi::DenseTensor Abs(const phi::DenseTensor& x) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Log(const phi::DenseTensor& x) { - phi::DenseTensor t_x_m1 = Adds(x, -1); - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Exp(const phi::DenseTensor& x) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { - auto dim_x = x.dims(); - auto dim_y = 
y.dims(); - PADDLE_ENFORCE_EQ( - dim_x.size(), - 2, - platform::errors::InvalidArgument( - "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_y.size(), - 2, - platform::errors::InvalidArgument( - "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); - PADDLE_ENFORCE_EQ( - dim_x[1], - dim_y[0], - platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " - "got dim_x[1] = %d, dim_y[0] = %d.", - dim_x[1], - dim_y[0])); - phi::DenseTensor z; - z.mutable_data({dim_x[0], dim_y[1]}, place); - const auto& runner = - NpuOpRunner("MatMul", - {x, y}, - {z}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner.Run(stream); - return z; - } - void ConcatVoid(const std::vector& inputs, - const framework::DDim& shape_out, - int axis, - phi::DenseTensor* output) { - output->mutable_data(shape_out, place); - std::vector names; - for (size_t i = 0; i < inputs.size(); i++) { - names.push_back("x" + std::to_string(i)); - } - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*output}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } - phi::DenseTensor Concat(const std::vector& inputs, - const framework::DDim& shape_out, - int axis) { - phi::DenseTensor output; - ConcatVoid(inputs, shape_out, axis, &output); - return output; - } - phi::DenseTensor Slice(const phi::DenseTensor& x, - const std::vector& offsets, - const std::vector& size, - const framework::DDim& shape) { - phi::DenseTensor y; - y.mutable_data(shape, place); - const auto& runner = - NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -void Vector2Tensor(const framework::ExecutionContext& ctx, - const std::vector& vec, - const framework::DDim& ddim, - phi::DenseTensor* tsr) { - framework::TensorFromVector(vec, ctx.device_context(), tsr); - ctx.template device_context().Wait(); - tsr->Resize(ddim); -} - -template -void BoxCoderEnc(const framework::ExecutionContext& ctx, - const phi::DenseTensor* tb, - const phi::DenseTensor* pb, - const phi::DenseTensor* pbv, - const bool norm, - const std::vector& variance, - phi::DenseTensor* out) { - auto M = pb->dims()[0]; - auto N = tb->dims()[0]; - auto shape_0 = phi::make_ddim({4, 2}); - phi::DenseTensor m_diff; - phi::DenseTensor m_aver; - std::vector vec_diff = {static_cast(-1), - static_cast(0), - static_cast(0), - static_cast(-1), - static_cast(1), - static_cast(0), - static_cast(0), - static_cast(1)}; - std::vector vec_aver = {static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5), - static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5)}; - Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); - Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); - - BoxCoderFunction F(ctx); - phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - phi::DenseTensor tb_xy = F.Dot(*tb, m_aver); - phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); - - pb_xy.Resize({1, M, 2}); - pb_wh.Resize({1, M, 2}); - tb_xy.Resize({N, 1, 2}); - tb_wh.Resize({N, 1, 2}); - - auto shape_half = phi::make_ddim({N, M, 2}); - auto shape_full = phi::make_ddim({N, M, 4}); - - phi::DenseTensor out_xy_0 = F.DivWithBroadCast( - F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); - phi::DenseTensor out_wh_0 = - F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); - phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); - - if (pbv) { - F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); - } else { - phi::DenseTensor t_var; - std::vector vec_var(4); - for (auto i = 0; i < 4; i++) { - vec_var[i] = static_cast(variance[i]); - } - Vector2Tensor(ctx, vec_var, phi::make_ddim({1, 1, 4}), &t_var); - F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); - } -} - -template -void BoxCoderDec(const framework::ExecutionContext& ctx, - const phi::DenseTensor* tb, - const phi::DenseTensor* pb, - const phi::DenseTensor* pbv, - const bool norm, - const std::vector& variance, - int axis, - phi::DenseTensor* out) { - auto shape_0 = phi::make_ddim({4, 2}); - phi::DenseTensor m_diff; - phi::DenseTensor m_aver; - std::vector vec_diff = {static_cast(-1), - static_cast(0), - static_cast(0), - static_cast(-1), - static_cast(1), - static_cast(0), - static_cast(0), - static_cast(1)}; - std::vector vec_aver = {static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5), - static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5)}; - Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); - Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); - - BoxCoderFunction F(ctx); - phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2}) - : phi::make_ddim({pb->dims()[0], 1, 2}); - pb_xy.Resize(pb_resize_shape); - pb_wh.Resize(pb_resize_shape); - - auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2}); - std::vector tbox_slice_size = { - static_cast(tb->dims()[0]), static_cast(tb->dims()[1]), 2}; - phi::DenseTensor tbox01 = - F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); - phi::DenseTensor tbox23 = - F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); - - phi::DenseTensor tb_xy; - phi::DenseTensor tb_wh; - if (pbv) { - auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2}); - auto pbvt_resize_shape = axis == 0 ? 
phi::make_ddim({1, pbv->dims()[0], 2}) - : phi::make_ddim({pbv->dims()[0], 1, 2}); - std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; - phi::DenseTensor pbv_t01 = - F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); - phi::DenseTensor pbv_t23 = - F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); - pbv_t01.Resize(pbvt_resize_shape); - pbv_t23.Resize(pbvt_resize_shape); - - F.AddWithBroadCastVoid( - F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid( - F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), - pb_wh, - tbox_slice_shape, - &tb_wh); - } else if (variance.empty()) { - F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); - } else { - phi::DenseTensor t_var01, t_var23; - auto t_var_shape = phi::make_ddim({1, 1, 2}); - std::vector vec_var01 = {static_cast(variance[0]), - static_cast(variance[1])}; - std::vector vec_var23 = {static_cast(variance[2]), - static_cast(variance[3])}; - Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); - Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); - F.AddWithBroadCastVoid( - F.MulWithBroadCast(tbox01, - F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), - tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid( - F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), - pb_wh, - tbox_slice_shape, - &tb_wh); - } - phi::DenseTensor obox01 = - F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); - phi::DenseTensor obox23 = - F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), - (norm ? 0 : -1)); - F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); -} - -template -class BoxCoderNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* prior_box = ctx.Input("PriorBox"); - auto* prior_box_var = ctx.Input("PriorBoxVar"); - auto* target_box = ctx.Input("TargetBox"); - auto* output_box = ctx.Output("OutputBox"); - std::vector variance = ctx.Attr>("variance"); - const int axis = ctx.Attr("axis"); - - if (prior_box_var) { - PADDLE_ENFORCE_EQ(variance.empty(), - true, - platform::errors::InvalidArgument( - "Input 'PriorBoxVar' and attribute 'variance'" - " of BoxCoder operator should not be used at the " - "same time.")); - } - if (!(variance.empty())) { - PADDLE_ENFORCE_EQ(static_cast(variance.size()), - 4, - platform::errors::InvalidArgument( - "Size of attribute 'variance' in BoxCoder operator" - " should be 4. 
But received size is %d", - variance.size())); - } - - if (target_box->lod().size()) { - PADDLE_ENFORCE_EQ(target_box->lod().size(), - 1, - platform::errors::InvalidArgument( - "Input 'TargetBox' of BoxCoder operator only" - " supports LoD with one level.")); - } - - auto code_type = - phi::funcs::GetBoxCodeType(ctx.Attr("code_type")); - bool normalized = ctx.Attr("box_normalized"); - - if (code_type == phi::funcs::BoxCodeType::kEncodeCenterSize) { - BoxCoderEnc(ctx, - target_box, - prior_box, - prior_box_var, - normalized, - variance, - output_box); - } else { - BoxCoderDec(ctx, - target_box, - prior_box, - prior_box_var, - normalized, - variance, - axis, - output_box); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(box_coder, - ops::BoxCoderNPUKernel, - ops::BoxCoderNPUKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc deleted file mode 100644 index c9935e54d82ef..0000000000000 --- a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc +++ /dev/null @@ -1,396 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection/density_prior_box_op.h" - -namespace paddle { -namespace operators { - -using fp16 = paddle::platform::float16; - -template -struct DensityPriorBoxFunction { - public: - explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context().stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - // x should be init first - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->type())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Concat(const std::vector& inputs, - int axis, - phi::DenseTensor* output) { - // output should be init first - std::vector names; - for (size_t i = 0; i < inputs.size(); i++) { - names.push_back("x" + std::to_string(i)); - } - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*output}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } - void Tile(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& multiples) { - // y should be init first - if (x->dims() == y->dims()) { - framework::TensorCopy( - *x, - place, - ctx.template device_context(), - y); - return; - } - const auto& runner = - NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}}); - runner.Run(stream); - } - void FloatVec2Tsr(const std::vector& vec, phi::DenseTensor* tsr_dst) { - // - framework::TensorFromVector(vec, ctx.device_context(), tsr_dst); - ctx.template device_context().Wait(); - } - - private: - platform::Place place; - 
aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -template <> -void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, - phi::DenseTensor* tsr_dst) { - phi::DenseTensor tsr_fp32(phi::DataType::FLOAT32); - tsr_fp32.mutable_data(tsr_dst->dims(), place); - framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); - ctx.template device_context().Wait(); - Cast(&tsr_fp32, tsr_dst); -} - -template -class DensityPriorBoxOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); - - auto variances = ctx.Attr>("variances"); - auto clip = ctx.Attr("clip"); - - auto fixed_sizes = ctx.Attr>("fixed_sizes"); - auto fixed_ratios = ctx.Attr>("fixed_ratios"); - auto densities = ctx.Attr>("densities"); - - float step_w = ctx.Attr("step_w"); - float step_h = ctx.Attr("step_h"); - float offset = ctx.Attr("offset"); - - int image_w = image->dims()[3]; - int image_h = image->dims()[2]; - int layer_w = input->dims()[3]; - int layer_h = input->dims()[2]; - - auto _type = input->dtype(); - auto place = ctx.GetPlace(); - DensityPriorBoxFunction F(ctx); - - phi::DenseTensor h(_type); - h.mutable_data({layer_h}, place); - phi::DenseTensor w(_type); - w.mutable_data({layer_w}, place); - F.Arange(layer_h, &h); - F.Arange(layer_w, &w); - h.Resize({layer_h, 1, 1, 1}); - w.Resize({1, layer_w, 1, 1}); - - step_w = step_w > 0 ? step_w : static_cast(image_w) / layer_w; - step_h = step_h > 0 ? 
step_h : static_cast(image_h) / layer_h; - int step_average = static_cast((step_w + step_h) * 0.5); - - int ratios_size = fixed_ratios.size(); - int num_priors_per_ratio = 0; - for (size_t i = 0; i < densities.size(); ++i) { - num_priors_per_ratio += densities[i] * densities[i]; - } - phi::DenseTensor di(_type); - phi::DenseTensor dj(_type); - phi::DenseTensor shifts(_type); - phi::DenseTensor box_w_ratio(_type); - phi::DenseTensor box_h_ratio(_type); - di.mutable_data({ratios_size * num_priors_per_ratio}, place); - dj.mutable_data({ratios_size * num_priors_per_ratio}, place); - shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); - box_w_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); - box_h_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); - - int64_t start = 0; - std::vector vec_tile = {0, 0, 0}; - for (size_t i = 0; i < densities.size(); ++i) { - // Range = start:start+ratios_size*density_sqr, density = densities[i] - int density_sqr = densities[i] * densities[i]; - // shifts[Range] = [step_average/density]*ratios_size*density_sqr - phi::DenseTensor shifts_part = - shifts.Slice(start, start + ratios_size * density_sqr); - FillNpuTensorWithConstant(&shifts_part, - static_cast(step_average / densities[i])); - - // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size - // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size - phi::DenseTensor di_part = - di.Slice(start, start + ratios_size * density_sqr); - phi::DenseTensor dj_part = - dj.Slice(start, start + ratios_size * density_sqr); - if (densities[i] > 1) { - di_part.Resize({ratios_size, densities[i], densities[i]}); - dj_part.Resize({ratios_size, densities[i], densities[i]}); - phi::DenseTensor range_n(_type); - range_n.mutable_data({densities[i]}, place); - F.Arange(densities[i], &range_n); - range_n.Resize({1, densities[i], 1}); - vec_tile[0] = ratios_size; - vec_tile[1] = 1; - vec_tile[2] = densities[i]; - F.Tile(&range_n, &di_part, vec_tile); - range_n.Resize({1, 1, densities[i]}); - vec_tile[1] = densities[i]; - vec_tile[2] = 1; - F.Tile(&range_n, &dj_part, vec_tile); - } else { - FillNpuTensorWithConstant(&di_part, static_cast(0)); - FillNpuTensorWithConstant(&dj_part, static_cast(0)); - } - - int start_box_ratio = start; - for (float ar : fixed_ratios) { - // Range_mini = start_box_ratio:start_box_ratio+density_sqr - // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr - // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr - phi::DenseTensor box_h_ratio_part = - box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - phi::DenseTensor box_w_ratio_part = - box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - FillNpuTensorWithConstant(&box_w_ratio_part, - static_cast(fixed_sizes[i] * sqrt(ar))); - FillNpuTensorWithConstant(&box_h_ratio_part, - static_cast(fixed_sizes[i] / sqrt(ar))); - start_box_ratio += density_sqr; - } - start = start_box_ratio; - } - di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - - // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts - // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts - phi::DenseTensor c_x(_type); - phi::DenseTensor c_y(_type); - auto dim0 
= - phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1}); - auto dim1 = - phi::make_ddim({layer_h, 1, ratios_size * num_priors_per_ratio, 1}); - c_x.mutable_data(dim0, place); - c_y.mutable_data(dim1, place); - F.Adds(&w, offset, &w); - F.Muls(&w, step_w, &w); - F.Adds(&w, static_cast(-step_average) * static_cast(0.5), &w); - F.Adds(&h, offset, &h); - F.Muls(&h, step_h, &h); - F.Adds(&h, static_cast(-step_average) * static_cast(0.5), &h); - F.Mul(&di, &shifts, &di); - F.Mul(&dj, &shifts, &dj); - F.Muls(&shifts, static_cast(0.5), &shifts); - F.Add(&di, &shifts, &di); - F.Add(&dj, &shifts, &dj); - F.Add(&dj, &w, &c_x); - F.Add(&di, &h, &c_y); - - // box_w_ratio = box_w_ratio / 2 - // box_h_ratio = box_h_ratio / 2 - F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); - F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); - - phi::DenseTensor zero_t(_type); - phi::DenseTensor one_t(_type); - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - - phi::DenseTensor outbox0(_type); - phi::DenseTensor outbox1(_type); - phi::DenseTensor outbox2(_type); - phi::DenseTensor outbox3(_type); - outbox0.mutable_data(dim0, place); - outbox1.mutable_data(dim1, place); - outbox2.mutable_data(dim0, place); - outbox3.mutable_data(dim1, place); - - // outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 ) - // outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 ) - // outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 ) - // outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 ) - F.Sub(&c_x, &box_w_ratio, &outbox0); - F.Sub(&c_y, &box_h_ratio, &outbox1); - F.Add(&c_x, &box_w_ratio, &outbox2); - F.Add(&c_y, &box_h_ratio, &outbox3); - F.Muls(&outbox0, static_cast(1.0 / image_w), &outbox0); - F.Muls(&outbox1, static_cast(1.0 / image_h), &outbox1); - F.Muls(&outbox2, static_cast(1.0 / image_w), &outbox2); - F.Muls(&outbox3, static_cast(1.0 / image_h), &outbox3); - - F.Maximum(&outbox0, &zero_t, &outbox0); - F.Maximum(&outbox1, &zero_t, &outbox1); - F.Minimum(&outbox2, &one_t, &outbox2); - F.Minimum(&outbox3, &one_t, &outbox3); - if (clip) { - // outbox0 = min ( outbox0, 1 ) - // outbox1 = min ( outbox1, 1 ) - // outbox2 = max ( outbox2, 0 ) - // outbox3 = max ( outbox3, 0 ) - F.Minimum(&outbox0, &one_t, &outbox0); - F.Minimum(&outbox1, &one_t, &outbox1); - F.Maximum(&outbox2, &zero_t, &outbox2); - F.Maximum(&outbox3, &zero_t, &outbox3); - } - - auto out_dim = phi::make_ddim( - {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); - boxes->mutable_data(place); - vars->mutable_data(place); - phi::DenseTensor boxes_share(_type); - phi::DenseTensor vars_share(_type); - boxes_share.ShareDataWith(*boxes); - boxes_share.Resize(out_dim); - vars_share.ShareDataWith(*vars); - vars_share.Resize(out_dim); - - phi::DenseTensor box0(_type); - phi::DenseTensor box1(_type); - phi::DenseTensor box2(_type); - phi::DenseTensor box3(_type); - // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} - out_dim[3] = 1; - box0.mutable_data(out_dim, place); - box1.mutable_data(out_dim, place); - box2.mutable_data(out_dim, place); - box3.mutable_data(out_dim, place); - - std::vector vec_exp_out02 = {layer_h, 1, 1, 1}; - std::vector vec_exp_out13 = {1, layer_w, 1, 1}; - F.Tile(&outbox0, &box0, vec_exp_out02); - F.Tile(&outbox1, &box1, vec_exp_out13); - F.Tile(&outbox2, &box2, vec_exp_out02); - F.Tile(&outbox3, &box3, vec_exp_out13); - F.Concat({box0, box1, box2, box3}, 3, &boxes_share); - - std::vector 
multiples = { - layer_h, layer_w, ratios_size * num_priors_per_ratio, 1}; - phi::DenseTensor variances_t(_type); - // variances.size() == 4 - variances_t.mutable_data({4}, place); - F.FloatVec2Tsr(variances, &variances_t); - F.Tile(&variances_t, &vars_share, multiples); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(density_prior_box, - ops::DensityPriorBoxOpNPUKernel, - ops::DensityPriorBoxOpNPUKernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc deleted file mode 100644 index 8395e25d46251..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/iou_similarity_op.h" - -namespace paddle { -namespace operators { - -template -struct IouFunction { - public: - explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - 
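// A scalar reference for what the kernel below computes per box pair
// (illustrative sketch assuming the usual corner-encoded boxes; not part
// of the original file):
//
//   float w = std::max(0.f, std::min(xmax1, xmax2) - std::max(xmin1, xmin2));
//   float h = std::max(0.f, std::min(ymax1, ymax2) - std::max(ymin1, ymin2));
//   float inter = w * h;                              // intersection area
//   float area1 = (xmax1 - xmin1) * (ymax1 - ymin1);
//   float area2 = (xmax2 - xmin2) * (ymax2 - ymin2);
//   float iou = inter / (area1 + area2 - inter);      // DivNoNan on the NPU
//
// The NPU version vectorizes this by transposing X and Y to {4, N} and
// {4, M}, reshaping each coordinate row to {N, 1} or {1, M}, and letting
// every elementwise CANN op broadcast to {N, M}; when box_normalized is
// false it first adds 1 to the widths and heights to measure inclusive
// pixel extents.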
-template -class IouSimilarityNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - bool normalized = ctx.Attr("box_normalized"); - auto* out = ctx.Output("Out"); - - auto _type = x->dtype(); - auto place = ctx.GetPlace(); - - IouFunction F(ctx); - - auto N = x->dims()[0]; - auto M = y->dims()[0]; - - out->mutable_data({N, M}, place); - phi::DenseTensor xt(_type); - phi::DenseTensor yt(_type); - xt.mutable_data({4, N}, place); - yt.mutable_data({4, M}, place); - std::vector vec_trans = {1, 0}; - F.Transpose(x, &xt, vec_trans); - F.Transpose(y, &yt, vec_trans); - phi::DenseTensor xmin1 = xt.Slice(0, 1); - phi::DenseTensor ymin1 = xt.Slice(1, 2); - phi::DenseTensor xmax1 = xt.Slice(2, 3); - phi::DenseTensor ymax1 = xt.Slice(3, 4); - phi::DenseTensor xmin2 = yt.Slice(0, 1); - phi::DenseTensor ymin2 = yt.Slice(1, 2); - phi::DenseTensor xmax2 = yt.Slice(2, 3); - phi::DenseTensor ymax2 = yt.Slice(3, 4); - xmin1.Resize({N, 1}); - ymin1.Resize({N, 1}); - xmax1.Resize({N, 1}); - ymax1.Resize({N, 1}); - xmin2.Resize({1, M}); - ymin2.Resize({1, M}); - xmax2.Resize({1, M}); - ymax2.Resize({1, M}); - - phi::DenseTensor w1(_type); - phi::DenseTensor h1(_type); - phi::DenseTensor w2(_type); - phi::DenseTensor h2(_type); - phi::DenseTensor area1(_type); - phi::DenseTensor area2(_type); - w1.mutable_data({N, 1}, place); - h1.mutable_data({N, 1}, place); - w2.mutable_data({1, M}, place); - h2.mutable_data({1, M}, place); - area1.mutable_data({N, 1}, place); - area2.mutable_data({1, M}, place); - F.Sub(&xmax1, &xmin1, &w1); - F.Sub(&ymax1, &ymin1, &h1); - F.Sub(&xmax2, &xmin2, &w2); - F.Sub(&ymax2, &ymin2, &h2); - if (!normalized) { - F.Adds(&w1, 1.0f, &w1); - F.Adds(&h1, 1.0f, &h1); - F.Adds(&w2, 1.0f, &w2); - F.Adds(&h2, 1.0f, &h2); - } - F.Mul(&w1, &h1, &area1); - F.Mul(&w2, &h2, &area2); - - phi::DenseTensor inter_xmax(_type); - phi::DenseTensor inter_ymax(_type); - phi::DenseTensor inter_xmin(_type); - phi::DenseTensor inter_ymin(_type); - inter_xmax.mutable_data({N, M}, place); - inter_ymax.mutable_data({N, M}, place); - inter_xmin.mutable_data({N, M}, place); - inter_ymin.mutable_data({N, M}, place); - F.Minimum(&xmax1, &xmax2, &inter_xmax); - F.Minimum(&ymax1, &ymax2, &inter_ymax); - F.Maximum(&xmin1, &xmin2, &inter_xmin); - F.Maximum(&ymin1, &ymin2, &inter_ymin); - - phi::DenseTensor inter_w(_type); - phi::DenseTensor inter_h(_type); - inter_w.mutable_data({N, M}, place); - inter_h.mutable_data({N, M}, place); - F.Sub(&inter_xmax, &inter_xmin, &inter_w); - F.Sub(&inter_ymax, &inter_ymin, &inter_h); - - if (!normalized) { - F.Adds(&inter_w, 1.0f, &inter_w); - F.Adds(&inter_h, 1.0f, &inter_h); - } - phi::DenseTensor zeros(_type); - zeros.mutable_data({1}, place); - FillNpuTensorWithConstant(&zeros, static_cast(0)); - F.Maximum(&inter_w, &zeros, &inter_w); - F.Maximum(&inter_h, &zeros, &inter_h); - - F.Mul(&inter_w, &inter_h, out); - phi::DenseTensor union_area(_type); - union_area.mutable_data({N, M}, place); - F.Add(&area1, &area2, &union_area); - F.Sub(&union_area, out, &union_area); - F.DivNoNan(out, &union_area, out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(iou_similarity, - ops::IouSimilarityNPUKernel, - ops::IouSimilarityNPUKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op_npu.cc b/paddle/fluid/operators/detection/prior_box_op_npu.cc 
deleted file mode 100644 index 7df68d2bbb1bb..0000000000000 --- a/paddle/fluid/operators/detection/prior_box_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/prior_box_op.h" - -namespace paddle { -namespace operators { - -template -class PriorBoxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* variances = ctx.Output("Variances"); - - PADDLE_ENFORCE_EQ(boxes->dims(), - variances->dims(), - platform::errors::Unimplemented( - "the shape of boxes and variances must be same in " - "the npu kernel of prior_box, but got boxes->dims() " - "= [%s], variances->dims() = [%s]", - boxes->dims(), - variances->dims())); - - auto min_sizes = ctx.Attr>("min_sizes"); - auto max_sizes = ctx.Attr>("max_sizes"); - auto aspect_ratios = ctx.Attr>("aspect_ratios"); - auto variances_attr = ctx.Attr>("variances"); - bool flip = ctx.Attr("flip"); - bool clip = ctx.Attr("clip"); - float step_w = ctx.Attr("step_w"); - float step_h = ctx.Attr("step_h"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor out(input->type()); - auto out_dims = phi::vectorize(boxes->dims()); - out_dims.insert(out_dims.begin(), 2); - out.Resize(phi::make_ddim(out_dims)); - out.mutable_data(place); - - framework::NPUAttributeMap attr_input = {{"min_size", min_sizes}, - {"max_size", max_sizes}, - {"aspect_ratio", aspect_ratios}, - {"step_h", step_h}, - {"step_w", step_w}, - {"flip", flip}, - {"clip", clip}, - {"offset", offset}, - {"variance", variances_attr}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("PriorBox", {*input, *image}, {out}, attr_input); - runner.Run(stream); - - out.Resize(phi::make_ddim({out.numel()})); - phi::DenseTensor out_boxes = out.Slice(0, boxes->numel()); - phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel()); - - out_boxes.Resize(boxes->dims()); - out_variances.Resize(variances->dims()); - - boxes->mutable_data(place); - variances->mutable_data(place); - - framework::TensorCopy( - out_boxes, - place, - ctx.template device_context(), - boxes); - framework::TensorCopy( - out_variances, - place, - ctx.template device_context(), - variances); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - prior_box, - ops::PriorBoxNPUKernel, - ops::PriorBoxNPUKernel); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc deleted file mode 100644 index 9c84961f611c0..0000000000000 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ /dev/null @@ -1,212 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class DropoutNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* seed_tensor = - ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out}); - runner_zeros_out.Run(stream); - mask->mutable_data(ctx.GetPlace()); - const auto& runner_zeros_mask = - NpuOpRunner("ZerosLike", {*mask}, {*mask}); - runner_zeros_mask.Run(stream); - return; - } - - // only achieve the default `upscale_in_train` method - if (!is_test) { - phi::DenseTensor tmp_x(x->dtype()); - phi::DenseTensor tmp_out(out->dtype()); - tmp_x.ShareDataWith(*x); - tmp_out.ShareDataWith(*out); - if (x->dims().size() == 1) { - // DropOutDoMask will get error result when input - // is 1-D. Make it become 2-D. - std::vector vec_dim = phi::vectorize(x->dims()); - tmp_x.Resize(phi::make_ddim({vec_dim[0], 1})); - tmp_out.Resize(phi::make_ddim({vec_dim[0], 1})); - } - - int seed = 0; - int seed2 = 0; - float keep_prob = 1. - dropout_prob; - if (seed_tensor) { - std::vector seed_data; - paddle::framework::TensorToVector( - *seed_tensor, ctx.device_context(), &seed_data); - seed = seed_data[0]; - } else { - seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; - } - - phi::DenseTensor keep_prob_tensor(x->dtype()); - keep_prob_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&keep_prob_tensor, - static_cast(keep_prob)); - - mask->mutable_data(ctx.GetPlace()); - - // mask used in `DropOutGenMask` NPU OP is different from - // the output `Mask`. - phi::DenseTensor npu_mask(phi::DataType::UINT8); - uint32_t length = (x->numel() + 128 - 1) / 128 * 128; - npu_mask.Resize(phi::make_ddim({length / 8})); - npu_mask.mutable_data(ctx.GetPlace()); - - // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU - // OP must be a scalar with shape[0]. At present, the shape - // of the `prob` phi::DenseTensor of this OP is forced to be set to 0 - // in `npu_op_runner.cc`, which needs to be optimized later. 
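// Worked example of the mask sizing above (assumed numbers, not from the
// original file): for x->numel() == 1000,
// length = (1000 + 127) / 128 * 128 = 1024 mask bits, so npu_mask holds
// length / 8 = 128 uint8 bytes. DropOutGenMask emits one keep/drop bit
// per element, padded up to a 128-bit boundary.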
- NpuOpRunner runner_gen_mask; - runner_gen_mask.SetType("DropOutGenMask") - .AddInput(phi::vectorize(tmp_out.dims())) - .AddInput(keep_prob_tensor) - .AddOutput(npu_mask) - .AddAttr("seed", seed) - .AddAttr("seed2", seed2); - runner_gen_mask.Run(stream); - - NpuOpRunner runner_dropout; - runner_dropout.SetType("DropOutDoMask") - .AddInput(tmp_x) - .AddInput(npu_mask) - .AddInput(keep_prob_tensor) - .AddOutput(tmp_out); - runner_dropout.Run(stream); - - // cast `out` from float/float16 to bool - phi::DenseTensor cast_mask(phi::DataType::BOOL); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype_bool = - ConvertToNpuDtype(framework::TransToProtoVarType(cast_mask.dtype())); - const auto& runner_cast_mask_bool = - NpuOpRunner("Cast", - {*out}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype_bool)}}); - runner_cast_mask_bool.Run(stream); - - // cast cast_mask from bool to uint8 - auto dst_dtype_uint8 = - ConvertToNpuDtype(framework::TransToProtoVarType(mask->dtype())); - const auto& runner_cast_mask_uint8 = - NpuOpRunner("Cast", - {cast_mask}, - {*mask}, - {{"dst_type", static_cast(dst_dtype_uint8)}}); - runner_cast_mask_uint8.Run(stream); - } else { - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - } - } -}; - -template -class DropoutGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* mask = ctx.Input("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ(is_test, - false, - platform::errors::PreconditionNotMet( - "GradOp is only callable when is_test is false")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx}); - runner_zeros.Run(stream); - return; - } - - // cast mask from uint8 to float32/float16 - phi::DenseTensor cast_mask(dx->dtype()); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dx->dtype())); - const auto& runner_cast_mask = - NpuOpRunner("Cast", - {*mask}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mask.Run(stream); - - const auto& runner = - NpuOpRunner("MaskedScale", - {*dout, cast_mask}, - {*dx}, - {{"value", static_cast(1. 
/ (1 - dropout_prob))}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - dropout, - ops::DropoutNPUKernel, - ops::DropoutNPUKernel); - -REGISTER_OP_NPU_KERNEL( - dropout_grad, - ops::DropoutGradNPUKernel, - ops::DropoutGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt index 25b34a2c0a2c3..e0714041dfabe 100644 --- a/paddle/fluid/operators/elementwise/CMakeLists.txt +++ b/paddle/fluid/operators/elementwise/CMakeLists.txt @@ -17,16 +17,3 @@ cc_test( test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) - -if(WITH_ASCEND_CL) - cc_test( - elementwise_op_npu_test - SRCS elementwise_op_npu_test.cc - DEPS op_registry - elementwise_add_op - elementwise_sub_op - scope - device_context - enforce - executor) -endif() diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc deleted file mode 100644 index 2ae45d5973d2a..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseAddNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - - if (x_dims.size() == y_dims.size()) { - direct_compute = true; - } else if (x_dims.size() > y_dims.size()) { - direct_compute = x_dims.size() == (y_dims.size() + axis); - } else { - direct_compute = y_dims.size() == (x_dims.size() + axis); - } - - if (direct_compute) { - const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - const auto& runner = - NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {}); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class ElementwiseAddGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); - auto stream = dev_ctx.stream(); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - if (dx->dims() != dout->dims()) { - std::vector dst_dims_vec; - std::vector reduce_axes; - auto src_dims = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (!reduce_axes.empty()) { - phi::DenseTensor tmp; - tmp.ShareDataWith(*dx); - tmp.Resize(phi::make_ddim(dst_dims_vec)); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {tmp}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); - } - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if (dy->dims() != dout->dims()) { - std::vector dst_dims_vec; - std::vector reduce_axes; - auto src_dims = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (!reduce_axes.empty()) { - phi::DenseTensor tmp; - tmp.ShareDataWith(*dy); - tmp.Resize(phi::make_ddim(dst_dims_vec)); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {tmp}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(elementwise_add, - ops::ElementwiseAddNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseAddNPUKernel, -#endif - ops::ElementwiseAddNPUKernel); - -REGISTER_OP_NPU_KERNEL(elementwise_add_grad, - ops::ElementwiseAddGradNPUKernel, - ops::ElementwiseAddGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc deleted file mode 100644 index 259a517a2d32d..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseDivNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ElementwiseDivGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - if (dx) { - dx->mutable_data(place); - - phi::DenseTensor tensor_one(y->type()); - tensor_one.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_one, static_cast(1.0)); - - // Use `Div` CANN OP to achieve `1/y` instead of `Power` CANN OP. - // Because `Power` will cause precision overflow, that is, `float_status` - // will be set to 1. 
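// In scalar form, the masked gradient assembled below is (illustrative
// sketch, not part of the original file):
//
//   dx[i] = dout[i] * (x[i] != 0 ? 1 / y[i] : 0)
//
// 1/y comes from Div applied to the 1-filled scalar and y, the x != 0
// predicate from ZerosLike + Equal + LogicalNot + Cast(float), and Mul
// combines the factors, avoiding the Power CANN op mentioned above.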
- phi::DenseTensor y_div(y->type()); - y_div.mutable_data(y->dims(), place); - const auto& runner_one_div_y = - NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {}); - runner_one_div_y.Run(stream); - - phi::DenseTensor tensor_zeros(x->type()); - tensor_zeros.mutable_data(x->dims(), place); - const auto& runner_tensor_zeros = - NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); - runner_tensor_zeros.Run(stream); - - phi::DenseTensor x_zero(phi::DataType::BOOL); - x_zero.mutable_data(x->dims(), place); - const auto& runner_x_zero = - NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); - runner_x_zero.Run(stream); - - phi::DenseTensor x_nozero(phi::DataType::BOOL); - x_nozero.mutable_data(x->dims(), place); - const auto& runner_x_nonzero = - NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); - runner_x_nonzero.Run(stream); - - phi::DenseTensor x_nozero_f(x->type()); - x_nozero_f.mutable_data(x->dims(), place); - const auto& runner_x_nonzero_f = - NpuOpRunner("Cast", - {x_nozero}, - {x_nozero_f}, - {{"dst_type", static_cast(0)}}); - runner_x_nonzero_f.Run(stream); - - phi::DenseTensor x_grad_w(x->type()); - x_grad_w.mutable_data(x->dims(), place); - const auto& runner_x_grad_w = - NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {}); - runner_x_grad_w.Run(stream); - - const auto& runner_x_grad = - NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); - runner_x_grad.Run(stream); - } - - if (dy) { - dy->mutable_data(place); - - phi::DenseTensor neg_out(out->type()); - neg_out.mutable_data(out->dims(), place); - const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {}); - runner_neg_out.Run(stream); - - phi::DenseTensor tmp_mul(out->type()); - tmp_mul.mutable_data(out->dims(), place); - const auto& runner_mul = - NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, {}); - runner_mul.Run(stream); - - if (dy->dims() != dout->dims()) { - phi::DenseTensor reduced_tmp_mul(y->type()); - reduced_tmp_mul.mutable_data(y->dims(), place); - - std::vector axes; - int64_t diff = dout->dims().size() - dy->dims().size(); - for (int64_t i = 0; i < dout->dims().size(); ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (dout->dims()[i] > dy->dims()[i - diff]) { - axes.push_back(i); - } - } - const auto& runner_reduce = - NpuOpRunner("ReduceSumD", - {tmp_mul}, - {reduced_tmp_mul}, - {{"axes", axes}, {"keep_dims", false}}); - runner_reduce.Run(stream); - - const auto& runner_y_grad = - NpuOpRunner("Div", {reduced_tmp_mul, *y}, {*dy}, {}); - runner_y_grad.Run(stream); - } else { - const auto& runner_y_grad = - NpuOpRunner("Div", {tmp_mul, *y}, {*dy}, {}); - runner_y_grad.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - elementwise_div, - ops::ElementwiseDivNPUKernel, - ops::ElementwiseDivNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradNPUKernel, - ops::ElementwiseDivGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc deleted file mode 100644 index 791c352157781..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
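// A scalar sketch, with double stand-ins for tensors, of how the deleted
// div-grad kernel above assembles dx: compute 1/y with a plain division
// (Div) rather than Power, to avoid the float_status overflow noted in the
// comment; zero the result where x == 0 (Equal + LogicalNot + Cast); then
// scale by dout. The function name is invented for this sketch.
#include <cstdio>
#include <vector>

std::vector<double> DivGradDx(const std::vector<double>& x,
                              const std::vector<double>& y,
                              const std::vector<double>& dout) {
  std::vector<double> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const double y_div = 1.0 / y[i];                     // Div(one, y)
    const double x_nonzero = (x[i] == 0.0) ? 0.0 : 1.0;  // zero mask on x
    dx[i] = x_nonzero * y_div * dout[i];                 // Mul, Mul
  }
  return dx;
}

int main() {
  const auto dx = DivGradDx({2.0, 0.0}, {4.0, 4.0}, {1.0, 1.0});
  std::printf("%g %g\n", dx[0], dx[1]);  // 0.25 0
  return 0;
}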
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseFloorDivNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(elementwise_floordiv, - ops::ElementwiseFloorDivNPUKernel, - ops::ElementwiseFloorDivNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc deleted file mode 100644 index 1f3c6229c1854..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ /dev/null @@ -1,251 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - - auto stream = - ctx.template device_context() - .stream(); - - if (direct_compute) { - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); - runner.Run(stream); - } else { - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - const auto& runner = - NpuOpRunner("Maximum", {transformed_x, transformed_y}, {*out}, {}); - runner.Run(stream); - } - } -}; - -template -class ElementwiseMaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - // The ascend elementwise_max_grad op only supports broadcast - // when axis is -1, and requires that all the inputs have the - // same shape when axis is not -1. For convenience, we first - // broadcast the original inputs x and y to transformed_x and - // transformed_y, then use tmp tensors to hold the op - // output, and finally reduce the tmp tensor shape to match the - // paddle output. - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - - auto dout_dims = dout->dims(); - auto stream = dev_ctx.stream(); - framework::NPUAttributeMap attr_input = {{"grad_x", true}, - {"grad_y", true}}; - // Reshape info vector. - std::vector reduce_axes; - - if (dx && dy) { - dx->mutable_data(ctx.GetPlace()); - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_dx; - tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); - phi::DenseTensor tmp_dy; - tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner("MaximumGrad", - {*dout, transformed_x, transformed_y}, - {tmp_dx, tmp_dy}, - attr_input); - runner.Run(stream); - - if (x_dims != dout_dims) { - reduce_axes.clear(); - int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + x_dims.size()) || - (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dx}, - {*dx}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx); - } - - if (y_dims != dout_dims) { - reduce_axes.clear(); - int src_axis = (y_dims.size() < dout_dims.size() ?
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + y_dims.size()) || - (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dy}, - {*dy}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy); - } - - } else if (dx) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dx->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_dx; - tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner("MaximumGrad", - {*dout, transformed_x, transformed_y}, - {tmp_dx, zero_tensor}, - attr_input); - runner.Run(stream); - - if (x_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + x_dims.size()) || - (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dx}, - {*dx}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx); - } - - } else if (dy) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_dy; - tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner("MaximumGrad", - {*dout, transformed_x, transformed_y}, - {zero_tensor, tmp_dy}, - attr_input); - runner.Run(stream); - - if (y_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + y_dims.size()) || - (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dy}, - {*dy}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy); - } - } else { - PADDLE_THROW(platform::errors::Unavailable( - "Do not support all outputs to be empty.")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - elementwise_max, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_max_grad, - ops::ElementwiseMaxGradNPUKernel, - ops::ElementwiseMaxGradNPUKernel, - ops::ElementwiseMaxGradNPUKernel, - ops::ElementwiseMaxGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc deleted file mode 100644 index 18d31430eb242..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - int axis = ctx.Attr("axis"); - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - phi::DenseTensor transformed_x, transformed_y; - if (direct_compute) { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } else { - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - } - const auto& runner = - NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ElementwiseMinGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); - auto stream = dev_ctx.stream(); - if (dx && dy) { - // dx - dx->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_x; - tmp_x.ShareDataWith(*dx); - if (dx->dims() != dout->dims()) { - std::vector dst_dims_vec_x; - std::vector reduce_axes_x; - auto src_dims_x = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) || - (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) { - reduce_axes_x.push_back(ax); - } else { - dst_dims_vec_x.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_x.empty()) { - tmp_x.Resize(phi::make_ddim(dst_dims_vec_x)); - } - } - // dy - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_y; - tmp_y.ShareDataWith(*dy); - if (dy->dims() != dout->dims()) { - std::vector dst_dims_vec_y; - std::vector reduce_axes_y; - auto src_dims_y = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis_y = (src_dims_y.size() < dout_dims.size() ? 
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) || - (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) { - reduce_axes_y.push_back(ax); - } else { - dst_dims_vec_y.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_y.empty()) { - tmp_y.Resize(phi::make_ddim(dst_dims_vec_y)); - } - } - - const auto& runner = NpuOpRunner("MinimumGrad", - {*dout, *x, *y}, - {tmp_x, tmp_y}, - {{"grad_x", true}, {"grad_y", true}}); - runner.Run(stream); - - } else if (dx) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(y->dims(), ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - // dx - dx->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_x; - tmp_x.ShareDataWith(*dx); - if (dx->dims() != dout->dims()) { - std::vector dst_dims_vec_x; - std::vector reduce_axes_x; - auto src_dims_x = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) || - (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) { - reduce_axes_x.push_back(ax); - } else { - dst_dims_vec_x.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_x.empty()) { - tmp_x.Resize(phi::make_ddim(dst_dims_vec_x)); - } - } - - const auto& runner = NpuOpRunner("MinimumGrad", - {*dout, *x, *y}, - {tmp_x, zero_tensor}, - {{"grad_x", true}, {"grad_y", true}}); - runner.Run(stream); - - } else if (dy) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(x->dims(), ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - // dy - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_y; - tmp_y.ShareDataWith(*dy); - if (dy->dims() != dout->dims()) { - std::vector dst_dims_vec_y; - std::vector reduce_axes_y; - auto src_dims_y = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) || - (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) { - reduce_axes_y.push_back(ax); - } else { - dst_dims_vec_y.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_y.empty()) { - tmp_y.Resize(phi::make_ddim(dst_dims_vec_y)); - } - } - - const auto& runner = NpuOpRunner("MinimumGrad", - {*dout, *x, *y}, - {zero_tensor, tmp_y}, - {{"grad_x", true}, {"grad_y", true}}); - runner.Run(stream); - - } else { - PADDLE_THROW(platform::errors::Unavailable( - "At least one of dx and dy must be non-null.")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - elementwise_min, - ops::ElementwiseMinNPUKernel, - ops::ElementwiseMinNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_min_grad, - ops::ElementwiseMinGradNPUKernel, - ops::ElementwiseMinGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc deleted file mode 100644 index da7895b2481fe..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
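// A worked standalone example of the axis normalization used by all of these
// kernels: axis == -1 means "align the smaller operand with the trailing
// dimensions of the larger one", so the effective axis is the rank
// difference. Plain ints; the function name is invented for this sketch.
#include <cstdio>
#include <cstdlib>

int NormalizeAxis(int axis, int x_rank, int y_rank) {
  return axis == -1 ? std::abs(x_rank - y_rank) : axis;
}

int main() {
  // x: [2, 3, 4, 5], y: [4, 5] -> axis -1 normalizes to 2, aligning y with
  // dimensions 2..3 of x.
  std::printf("%d\n", NormalizeAxis(-1, 4, 2));  // prints 2
  std::printf("%d\n", NormalizeAxis(1, 4, 3));   // explicit axis is kept: 1
  return 0;
}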
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseModNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - - bool direct_compute = false; - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - - phi::DenseTensor transformed_x, transformed_y; - if (direct_compute) { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } else { - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("FloorMod", {transformed_x, transformed_y}, {*out}, {}); - auto stream = dev_ctx.stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - elementwise_mod, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc deleted file mode 100644 index 9af1293d672fb..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
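// A plain-C++ sketch of the direct_compute test used by the mod kernel below
// and its siblings: broadcasting can be skipped when the smaller shape equals
// the larger shape's trailing slice starting at axis, which is what the
// phi::slice_ddim comparison checks. Names are invented for this sketch.
#include <cstdio>
#include <vector>

bool DirectCompute(const std::vector<int>& x_dims,
                   const std::vector<int>& y_dims, int axis) {
  const auto& big = x_dims.size() >= y_dims.size() ? x_dims : y_dims;
  const auto& small = x_dims.size() >= y_dims.size() ? y_dims : x_dims;
  // small must equal big[axis .. big.size()) exactly.
  if (small.size() + static_cast<size_t>(axis) != big.size()) return false;
  for (size_t i = 0; i < small.size(); ++i) {
    if (small[i] != big[axis + i]) return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", DirectCompute({2, 3, 5}, {3, 5}, 1));  // 1: trailing match
  std::printf("%d\n", DirectCompute({2, 3, 5}, {3, 1}, 1));  // 0: needs broadcast
  return 0;
}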
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const int axis, - const framework::DDim& ddims, - const framework::DDim& brd_ddims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t brd_size = brd_ddims.size(); - int64_t org_size = ddims.size(); - // int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < brd_size; ++i) { - if (i < axis || i >= org_size + axis) { - axes.push_back(i); - continue; - } - if (brd_ddims[i] > ddims[i - axis]) { - axes.push_back(i); - } - } - // LOG(INFO) << "axes = " << phi::make_ddim(axes).to_str(); - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class ElementwiseMulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = x_dims.size() == (y_dims.size() + axis); - } else { - direct_compute = y_dims.size() == (x_dims.size() + axis); - } - - auto stream = ctx.template device_context().stream(); - - if (direct_compute) { - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); - runner.Run(stream); - } else { - phi::DenseTensor trans_x, trans_y; - NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); - const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); - runner.Run(stream); - } - } -}; - -template -class ElementwiseMulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - axis = (axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis); - auto stream = ctx.template device_context().stream(); - - phi::DenseTensor trans_x, trans_y; - NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); - - if (dx) { - if (dx->dims() == dout->dims()) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); - runner_dx.Run(stream); - } else { - phi::DenseTensor dx_temp(x->type()); - dx_temp.Resize(trans_x.dims()); - dx_temp.mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {}); - runner_dx.Run(stream); - ReduceDims( - ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, dx); - } - } - if (dy) { - if (dy->dims() == dout->dims()) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); - runner_dy.Run(stream); - } else { - phi::DenseTensor dy_temp(y->type()); - dy_temp.Resize(trans_y.dims()); - dy_temp.mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {}); - runner_dy.Run(stream); - ReduceDims( - ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, dy); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(elementwise_mul, - ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseMulNPUKernel, -#endif - ops::ElementwiseMulNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseMulGradNPUKernel, -#endif - ops::ElementwiseMulGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h deleted file mode 100644 index 9d31036e0c924..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_npu.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" - -namespace paddle { -namespace operators { - -template -void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* src, - int axis, - const framework::DDim& dst_dims, - phi::DenseTensor* transformed_src) { - auto stream = dev_ctx.stream(); - - // 1. 
expand the axis with dim 1 - auto src_dims = src->dims(); - phi::DenseTensor tmp_src; - tmp_src.ShareDataWith(*src); - tmp_src.Resize(src_dims); - for (int i = 0; i < src_dims.size(); ++i) { - if (src_dims[i] == 1 && dst_dims[i + axis] > 1) { - phi::DenseTensor tmp_tensor; - auto tmp_tensor_dims = tmp_src.dims(); - tmp_tensor_dims[i] = dst_dims[i + axis]; - tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); - const auto& runner = - NpuOpRunner("TileWithAxis", - {tmp_src}, - {tmp_tensor}, - {{"axis", static_cast(i)}, - {"tiles", static_cast(dst_dims[i + axis])}}); - runner.Run(stream); - tmp_src.ShareDataWith(tmp_tensor); - tmp_src.Resize(tmp_tensor_dims); - } - } - - // 2.expand the ahead axis - auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis)); - if (prev > 1) { - phi::DenseTensor tmp_tensor; - auto tmp_tensor_dims = phi::slice_ddim(dst_dims, 0, axis + src_dims.size()); - tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ExpandD", - {tmp_src}, - {tmp_tensor}, - {{"shape", phi::vectorize(tmp_tensor_dims)}}); - runner.Run(stream); - tmp_src.ShareDataWith(tmp_tensor); - tmp_src.Resize(tmp_tensor_dims); - } else { - tmp_src.Resize(phi::slice_ddim(dst_dims, 0, axis + src_dims.size())); - } - - // 3.expand the tail axis - auto post = phi::product( - phi::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size())); - if (post > 1) { - auto src_dims_vec = phi::vectorize(tmp_src.dims()); - src_dims_vec.push_back(1); - tmp_src.Resize(phi::make_ddim(src_dims_vec)); - - phi::DenseTensor tmp_tensor; - tmp_tensor.mutable_data(dst_dims, dev_ctx.GetPlace()); - const auto& runner = - NpuOpRunner("TileWithAxis", - {tmp_src}, - {tmp_tensor}, - {{"axis", static_cast(axis + src_dims.size())}, - {"tiles", static_cast(post)}}); - runner.Run(stream); - tmp_src.ShareDataWith(tmp_tensor); - } - tmp_src.Resize(dst_dims); - framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src); -} - -template -void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - int axis, - phi::DenseTensor* transformed_x, - phi::DenseTensor* transformed_y) { - auto x_dims = x->dims(); - auto y_dims = y->dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - std::vector dst_dims_vec = phi::vectorize(x_dims); - - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - dst_dims_vec = phi::vectorize(y_dims); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - int x_axis = is_xsize_larger ? 0 : axis; - int y_axis = is_xsize_larger ? 
axis : 0; - - PADDLE_ENFORCE_GE( - axis, - 0, - platform::errors::InvalidArgument( - "Axis should be greater than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LE( - axis, - max_dim, - platform::errors::InvalidArgument( - "Axis should be less than or equal to %d, but received axis is %d.", - max_dim, - axis)); - - for (int i = 0; i < x_dims.size(); ++i) { - dst_dims_vec[i + x_axis] = - std::max(dst_dims_vec[i + x_axis], static_cast(x_dims[i])); - } - for (int i = 0; i < y_dims.size(); ++i) { - dst_dims_vec[i + y_axis] = - std::max(dst_dims_vec[i + y_axis], static_cast(y_dims[i])); - } - - auto dst_dims = phi::make_ddim(dst_dims_vec); - NpuBroadcast(dev_ctx, x, x_axis, dst_dims, transformed_x); - NpuBroadcast(dev_ctx, y, y_axis, dst_dims, transformed_y); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc deleted file mode 100644 index 0a8972ac4792f..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP_ITSELF(elementwise_sub); -USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); - -template -void Compare(f::Scope *scope, - const p::DeviceContext &ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto y = scope->Var("Y"); - auto tensor_y = y->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - std::vector init_y; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_y.push_back(static_cast(2.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - paddle::framework::TensorFromVector(init_y, ctx, tensor_y); - tensor_y->Resize({10, 10}); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, {{"X", {"X"}}, {"Y", {"Y"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - float expected = 0.0; - if (op_type == "elementwise_add") { - expected = 3.0; - } else if (op_type == "elementwise_sub") { - expected = -1.0; - } - EXPECT_EQ(out_vec.size(), init_x.size()); - for (uint32_t i = 0; i <
out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(expected)); - } -} - -template -void CompareGrad(f::Scope *scope, - const p::DeviceContext &ctx, - std::string op_type) { - // init - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - tensor_dout->Resize({2, 3, 5}); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - tensor_x->Resize({2, 3, 5}); - - auto y = scope->Var("Y"); - auto tensor_y = y->GetMutable(); - tensor_y->Resize({1, 5}); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - auto dy = scope->Var("DY"); - auto tensor_dy = dy->GetMutable(); - - std::vector init_dout; - for (int64_t i = 0; i < tensor_dout->numel(); ++i) { - init_dout.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize({2, 3, 5}); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}}, - {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - std::vector dy_vec; - paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); - - ctx.Wait(); - float expected_x = 0, expected_y = 0; - if (op_type == "elementwise_add_grad") { - expected_x = 1.0; - expected_y = 6.0; - } else if (op_type == "elementwise_sub_grad") { - expected_x = 1.0; - expected_y = -6.0; - } - - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_EQ(dx_vec[i], static_cast(expected_x)); - } - for (uint32_t i = 0; i < dy_vec.size(); i++) { - EXPECT_EQ(dy_vec[i], static_cast(expected_y)); - } -} - -TEST(elementwise_add, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "elementwise_add"); -} - -TEST(elementwise_sub, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "elementwise_sub"); -} - -TEST(elementwise_sub, NPU_fp16) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "elementwise_sub"); -} - -TEST(elementwise_sub_grad, NPU) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "elementwise_sub_grad"); -} - -TEST(elementwise_add_grad, NPU) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "elementwise_add_grad"); -} diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc deleted file mode 100644 index d0cf1ac28b1c6..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
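// Before the pow kernels below: a standalone sketch of how the deleted
// NpuElementWiseOpBroadcast helper above derives the destination shape both
// operands are broadcast to. Start from the higher-rank shape, offset each
// operand by its alignment axis, and take the per-dimension max. Plain
// vectors; the function name is invented for this sketch.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> BroadcastDstDims(const std::vector<int>& x,
                                  const std::vector<int>& y, int axis) {
  const bool x_larger = x.size() >= y.size();
  std::vector<int> dst = x_larger ? x : y;
  const int x_axis = x_larger ? 0 : axis;
  const int y_axis = x_larger ? axis : 0;
  for (size_t i = 0; i < x.size(); ++i)
    dst[i + x_axis] = std::max(dst[i + x_axis], x[i]);
  for (size_t i = 0; i < y.size(); ++i)
    dst[i + y_axis] = std::max(dst[i + y_axis], y[i]);
  return dst;
}

int main() {
  // x: [2, 1, 5], y: [3, 1] at axis 1 -> dst: [2, 3, 5].
  for (int d : BroadcastDstDims({2, 1, 5}, {3, 1}, 1)) std::printf("%d ", d);
  std::printf("\n");
  return 0;
}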
*/ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwisePowNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - int axis = ctx.Attr("axis"); - - out->mutable_data(place); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = - (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - - auto stream = dev_ctx.stream(); - - if (direct_compute) { - const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); - runner.Run(stream); - } else { - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - const auto& runner = - NpuOpRunner("Pow", {transformed_x, transformed_y}, {*out}, {}); - runner.Run(stream); - } - } -}; - -template -class ElementwisePowGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - auto place = ctx.GetPlace(); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = - (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - - auto dout_dims = dout->dims(); - auto stream = dev_ctx.stream(); - // Reshape info vector. 
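// A scalar sketch of the gradients the surrounding pow-grad kernel assembles
// from CANN primitives: dx = dout * y * x^(y-1) and dy = dout * ln(x) * x^y,
// with ln(x) realized as log1p(x - 1), matching the kernel's Sub + Log1p
// chain. Double scalars stand in for tensors; the function name is invented
// for this sketch.
#include <cmath>
#include <cstdio>

void PowGrad(double x, double y, double dout, double* dx, double* dy) {
  *dx = dout * y * std::pow(x, y - 1.0);              // Mul, Pow(x, y-1), Mul
  *dy = dout * std::log1p(x - 1.0) * std::pow(x, y);  // Sub, Log1p, Pow, Mul
}

int main() {
  double dx = 0.0, dy = 0.0;
  PowGrad(2.0, 3.0, 1.0, &dx, &dy);
  std::printf("dx=%g dy=%g\n", dx, dy);  // dx=12, dy=ln(2)*8
  return 0;
}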
- std::vector reduce_axes; - if (dx) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, place); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dx->mutable_data(place); - phi::DenseTensor tmp_dx; - tmp_dx.mutable_data(dout_dims, place); - - // dx = dout * y * pow(x, y - 1); - phi::DenseTensor PowGrad_dx_temp1(dout->type()); - PowGrad_dx_temp1.mutable_data(dout->dims(), place); - const auto& runner_PowGrad_dx_temp1 = - NpuOpRunner("Mul", {*dout, transformed_y}, {PowGrad_dx_temp1}, {}); - runner_PowGrad_dx_temp1.Run(stream); - - phi::DenseTensor one_dx(transformed_y.type()); - one_dx.mutable_data(transformed_y.dims(), place); - const auto& runner_one_dx = - NpuOpRunner("OnesLike", {transformed_y}, {one_dx}, {}); - runner_one_dx.Run(stream); - - phi::DenseTensor sub_dx(transformed_y.type()); - sub_dx.mutable_data(transformed_y.dims(), place); - const auto& runner_sub_dx = - NpuOpRunner("Sub", {transformed_y, one_dx}, {sub_dx}, {}); - runner_sub_dx.Run(stream); - - phi::DenseTensor PowGrad_dx_temp2(transformed_x.type()); - PowGrad_dx_temp2.mutable_data(transformed_x.dims(), place); - const auto& runner_PowGrad_dx_temp2 = - NpuOpRunner("Pow", {transformed_x, sub_dx}, {PowGrad_dx_temp2}, {}); - runner_PowGrad_dx_temp2.Run(stream); - - const auto& runner_dx = NpuOpRunner( - "Mul", {PowGrad_dx_temp1, PowGrad_dx_temp2}, {tmp_dx}, {}); - runner_dx.Run(stream); - - if (x_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + x_dims.size()) || - (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dx}, - {*dx}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dx, place, dev_ctx, dx); - } - } - if (dy) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, place); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dy->mutable_data(place); - phi::DenseTensor tmp_dy; - tmp_dy.mutable_data(dout_dims, place); - - // dy = dout * log(x) * pow(x, y) - phi::DenseTensor PowGrad_dy_temp1(transformed_x.type()); - PowGrad_dy_temp1.mutable_data(transformed_x.dims(), place); - const auto& runner_PowGrad_dy_temp1 = NpuOpRunner( - "Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {}); - runner_PowGrad_dy_temp1.Run(stream); - - phi::DenseTensor one_dy(transformed_x.type()); - one_dy.mutable_data(transformed_x.dims(), place); - const auto& runner_one_dy = - NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {}); - runner_one_dy.Run(stream); - - phi::DenseTensor sub_dy(transformed_x.type()); - sub_dy.mutable_data(transformed_x.dims(), place); - const auto& runner_sub_dy = - NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {}); - runner_sub_dy.Run(stream); - - phi::DenseTensor log_dy(transformed_x.type()); - log_dy.mutable_data(transformed_x.dims(), place); - const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {}); - runner_log_dy.Run(stream); - - phi::DenseTensor PowGrad_dy_temp2(transformed_x.type()); - PowGrad_dy_temp2.mutable_data(transformed_x.dims(), place); - const auto& runner_PowGrad_dy_temp2 = NpuOpRunner( - "Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {}); - runner_PowGrad_dy_temp2.Run(stream); - - const auto& runner_dy = - NpuOpRunner("Mul", 
{*dout, PowGrad_dy_temp2}, {tmp_dy}, {}); - runner_dy.Run(stream); - - if (y_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + y_dims.size()) || - (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dy}, - {*dy}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dy, place, dev_ctx, dy); - } - } - if (!dx && !dy) { - PADDLE_THROW(platform::errors::Unavailable( - "At least one of dx and dy must be provided.")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - elementwise_pow, - ops::ElementwisePowNPUKernel, - ops::ElementwisePowNPUKernel, - ops::ElementwisePowNPUKernel, - ops::ElementwisePowNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_pow_grad, - ops::ElementwisePowGradNPUKernel, - ops::ElementwisePowGradNPUKernel, - ops::ElementwisePowGradNPUKernel, - ops::ElementwisePowGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc deleted file mode 100644 index 2b9d83cc57d97..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseSubNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ElementwiseSubGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(zhiqiu): It seems the Ascend Sub op follows the broadcast - // semantics with default axis=-1, - // so sub_grad should do a reduce if needed. - // For example, the shape of each variable in elementwise_sub: - // x, dx: [2, 3, 5] - // y, dy: [1, 5] - // out, dout: [2, 3, 5] - // Then, out = x - y => dx = dout, dy = -dout - // And, the shape of dy can be computed by a two-stage reduce, - // 1.
[2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. - // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - // For dx - // stage 1 - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - phi::DenseTensor* tmp_dout = const_cast(dout); - phi::DenseTensor reduced_dout(dx->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", - {*tmp_dout}, - {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopy( - *tmp_dout, - ctx.GetPlace(), - ctx.template device_context(), - dx); - } - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - // For dy - // stage 1 - auto reduce_ndim = dout->dims().size() - dy->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - phi::DenseTensor* tmp_dout = const_cast(dout); - phi::DenseTensor reduced_dy(dy->type()); - phi::DenseTensor reduced_dout(dy->type()); - - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - phi::DenseTensor* tmp_dy = tmp_dout; - for (auto i = 0; i < dy->dims().size(); ++i) { - if (dy->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - reduced_dy.Resize(dy->dims()); - reduced_dy.mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", - {*tmp_dout}, - {reduced_dy}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - tmp_dy = &reduced_dy; - } - - // stage 3, negative - const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(elementwise_sub, - ops::ElementwiseSubNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseSubNPUKernel, -#endif - ops::ElementwiseSubNPUKernel, - ops::ElementwiseSubNPUKernel); - -REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, - ops::ElementwiseSubGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseSubGradNPUKernel, -#endif - ops::ElementwiseSubGradNPUKernel, - ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc deleted file mode 100644 index 77f12f17ce258..0000000000000 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/expand_as_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandAsV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, - rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, - rank)); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, - MAX_RANK_SUPPORTED)); - ExpandAs(context); - } - - protected: - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], - 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in " - "target tensor for expand_as_v2 op.", - vec_in_dims[i], - target_shape[i])); - } - } - auto* out0 = context.Output("Out"); - - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - const auto& runner = - NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}}); - - auto stream = - context.template device_context() - .stream(); - - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc deleted file mode 100644 index d7e553b83bb67..0000000000000 --- a/paddle/fluid/operators/expand_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
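// Returning to the NOTE in the deleted elementwise_sub grad kernel above: a
// standalone sketch of its two-stage reduction of dout down to dy's shape.
// Stage 1 sums away the leading rank difference (keep_dims = false); stage 2
// sums the remaining axes where dy is 1 (keep_dims = true); the result is
// then negated. Names are invented for this sketch.
#include <cstdio>
#include <vector>

void SubGradDyAxes(const std::vector<int>& dout_dims,
                   const std::vector<int>& dy_dims,
                   std::vector<int>* stage1, std::vector<int>* stage2) {
  const int reduce_ndim =
      static_cast<int>(dout_dims.size()) - static_cast<int>(dy_dims.size());
  for (int i = 0; i < reduce_ndim; ++i) stage1->push_back(i);
  for (size_t i = 0; i < dy_dims.size(); ++i)
    if (dy_dims[i] == 1) stage2->push_back(static_cast<int>(i));
}

int main() {
  // dout: [2, 3, 5], dy: [1, 5] -> stage 1 reduces axis 0 ([2,3,5] -> [3,5]),
  // stage 2 reduces axis 0 again ([3,5] -> [1,5]), as in the NOTE above.
  std::vector<int> s1, s2;
  SubGradDyAxes({2, 3, 5}, {1, 5}, &s1, &s2);
  std::printf("stage1 axes: %zu, stage2 axes: %zu\n", s1.size(), s2.size());
  return 0;
}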
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - switch (rank) { - case 1: - Expand<1>(context); - break; - case 2: - Expand<2>(context); - break; - case 3: - Expand<3>(context); - break; - case 4: - Expand<4>(context); - break; - case 5: - Expand<5>(context); - break; - case 6: - Expand<6>(context); - break; - } - } - - protected: - template - void Expand(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto expand_times = get_expand_times(context); - PADDLE_ENFORCE_EQ(static_cast(in_dims.size()), - expand_times.size(), - platform::errors::InvalidArgument( - "The number of elements (%d) of 'expand_times' for " - "Op(expand) must be equal to the number " - "of dimensions (%d) of the input.", - expand_times.size(), - static_cast(in_dims.size()))); - auto* out0 = context.Output("Out"); - framework::DDim out_dims(in_dims); - - for (size_t i = 0; i < expand_times.size(); ++i) { - out_dims[i] *= expand_times[i]; - } - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - out0->Resize(out_dims); - out0->mutable_data(place); - - bool is_expand_times_all_one = - (out0->numel() == in0->numel()) ? true : false; - - if (is_expand_times_all_one) { - memory::Copy(place, - out0->mutable_data(place), - place, - in0->data(), - in0->numel() * sizeof(T), - stream); - if (out_dims != in_dims) { - out0->Resize(out_dims); - } - } else { - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc deleted file mode 100644 index e9d12beaa78de..0000000000000 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
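// A plain sketch of the shape arithmetic in the deleted expand kernel above:
// out_dims[i] = in_dims[i] * expand_times[i]. When every factor is 1 (the
// kernel detects this via out->numel() == in->numel()) it degrades to a raw
// memory::Copy instead of launching TileD. Names invented for this sketch.
#include <cstdio>
#include <vector>

std::vector<int> ExpandOutDims(const std::vector<int>& in_dims,
                               const std::vector<int>& expand_times,
                               bool* all_ones) {
  std::vector<int> out(in_dims);
  *all_ones = true;
  for (size_t i = 0; i < expand_times.size(); ++i) {
    out[i] *= expand_times[i];
    if (expand_times[i] != 1) *all_ones = false;
  }
  return out;
}

int main() {
  bool all_ones = false;
  const auto out = ExpandOutDims({3, 1, 7}, {1, 10, 1}, &all_ones);
  // Matches the nearby expand test: [3, 1, 7] * [1, 10, 1] -> [3, 10, 7],
  // taking the tiled path.
  std::printf("[%d, %d, %d] copy=%d\n", out[0], out[1], out[2], all_ones);
  return 0;
}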
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(expand); -USE_OP_DEVICE_KERNEL(expand, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto in = scope->Var("X"); - auto expand_times = scope->Var("ExpandTimes"); - auto out = scope->Var("Out"); - auto in_t = in->GetMutable(); - auto out_t = out->GetMutable(); - auto expand_times_t = expand_times->GetMutable(); - - auto place = ctx.GetPlace(); - paddle::framework::TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); - paddle::framework::TensorFromVector( - std::vector({1, 10, 1}), ctx, expand_times_t); - - in_t->Resize(phi::make_ddim({3, 1, 7})); - expand_times_t->Resize(phi::make_ddim({3})); - out_t->Resize(phi::make_ddim({3, 10, 7})); - out_t->mutable_data(place); - - f::AttributeMap attrs = {{}}; - auto op = - f::OpRegistry::CreateOp("expand", - {{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}}, - {{"Out", {"Out"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - auto out_dim = out_t->dims(); - EXPECT_EQ(out_dim.at(0), 3); - EXPECT_EQ(out_dim.at(1), 10); - EXPECT_EQ(out_dim.at(2), 7); -} - -TEST(expand, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc deleted file mode 100644 index 7f37fc67d529d..0000000000000 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - - auto in_dims = X->dims(); - auto expand_shape = get_expand_shape(ctx); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = expand_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector final_expand_shape(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size cannot be zero.")); - if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] --> - // final_expand_shape = [3,4,10,2] - PADDLE_ENFORCE_GT( - expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_v2 op.", - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X = - // [10,1] --> final_expand_shape = - // [3,4,10,4] - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - expand_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_v2 op.", - vec_in_dims[i], - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else { - final_expand_shape[i] = expand_shape[i]; - } - } else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape - // = [3,4,10,2] - PADDLE_ENFORCE_EQ( - expand_shape[i], - -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_v2 op, " - "only -1 is supported, but the value received is %d.", - expand_shape[i])); - final_expand_shape[i] = vec_in_dims[i]; - } - } - - framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}}; - - auto rank = X->dims().size(); - - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be positive, " - "but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be less than " - "or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto shape_size = final_expand_shape.size(); - PADDLE_ENFORCE_GE( - shape_size, - rank, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for expand_v2_npu op must " - "be " - "greater than or equal to the rank (%d) of the input 'X'.", - shape_size, - rank)); - PADDLE_ENFORCE_LE(shape_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for " - "expand_v2_npu op must be " - "less than or equal to %d.", - shape_size, - MAX_RANK_SUPPORTED)); - - framework::DDim out_dims = phi::make_ddim(final_expand_shape); - Out->Resize(out_dims); - Out->mutable_data(ctx.GetPlace()); - - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - 
}; - - if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::BOOL) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class ExpandV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // case 1: reduce dout dims to dx dims - // For example: [2, 120] --> [120] - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - - phi::DenseTensor tmp_dout(dout->dtype()); - phi::DenseTensor reduced_dout(dx->dtype()); - tmp_dout.ShareDataWith(*dout); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - tmp_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = reduced_dout; - } - - // case 2: reduce axis of dout in which dim is 1 - // For example: [12, 140] --> [1, 140] - - // case 3: copy dout to dx when shape is totally same, and dim in dx != 1 - // For example: [2, 10, 5] --> [2, 10, 5] - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", - {tmp_dout}, - {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_v2, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - expand_v2_grad, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel); diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc deleted file mode 100644 index ee71ebee9b066..0000000000000 --- a/paddle/fluid/operators/eye_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
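The shape resolution in the deleted ExpandV2NPUKernel above is self-contained enough to model on the host: the input rank is left-padded with 1s, positive shape entries must match any non-singleton input dimension, and -1 means "keep the input dimension". A sketch in plain standard C++ (illustrative names; exceptions stand in for PADDLE_ENFORCE):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    std::vector<int64_t> FinalExpandShape(std::vector<int64_t> in_dims,
                                          const std::vector<int64_t>& shape) {
      const size_t diff = shape.size() - in_dims.size();
      in_dims.insert(in_dims.begin(), diff, 1);  // left-pad the rank with 1s
      std::vector<int64_t> out(in_dims.size());
      for (size_t i = 0; i < in_dims.size(); ++i) {
        if (shape[i] == 0) throw std::invalid_argument("expanded size is zero");
        if (i < diff) {
          // Newly created leading dimension: must be positive.
          if (shape[i] < 0) throw std::invalid_argument("new dim must be > 0");
          out[i] = shape[i];
        } else if (shape[i] > 0) {
          if (in_dims[i] != 1 && in_dims[i] != shape[i])
            throw std::invalid_argument("non-singleton dim mismatch");
          out[i] = shape[i];
        } else {  // only -1 is accepted here: keep the input dimension
          out[i] = in_dims[i];
        }
      }
      return out;
    }

    int main() {
      // Mirrors the comment in the kernel:
      // shape = [3,4,-1,-1], X = [10,2] --> final shape = [3,4,10,2]
      assert(FinalExpandShape({10, 2}, {3, 4, -1, -1}) ==
             (std::vector<int64_t>{3, 4, 10, 2}));
    }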
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class EyeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto num_rows = ctx.Attr("num_rows"); - - auto d_nums = ctx.Attr("dtype"); - auto dtype = - ConvertToNpuDtype(static_cast(d_nums)); - - auto num_columns = ctx.Attr("num_columns"); - if (num_columns == -1) num_columns = num_rows; - - framework::NPUAttributeMap attr_input = { - {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - eye, - ops::EyeNPUKernel, - ops::EyeNPUKernel, - ops::EyeNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc deleted file mode 100644 index 62d3e5a82f5a3..0000000000000 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
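The deleted EyeNPUKernel above only marshals attributes: num_columns defaults to num_rows when it is -1, and CANN's Eye operator fills the matrix. A host-side model of the semantics being delegated (plain standard C++, illustrative names):

    #include <cassert>
    #include <vector>

    // num_rows x num_columns matrix with ones on the main diagonal;
    // num_columns == -1 means square, matching the kernel's default.
    std::vector<std::vector<float>> Eye(int num_rows, int num_columns) {
      if (num_columns == -1) num_columns = num_rows;
      std::vector<std::vector<float>> m(
          num_rows, std::vector<float>(num_columns, 0.0f));
      for (int i = 0; i < num_rows && i < num_columns; ++i) m[i][i] = 1.0f;
      return m;
    }

    int main() {
      auto m = Eye(2, -1);  // 2x2 identity
      assert(m[0][0] == 1.0f && m[0][1] == 0.0f && m[1][1] == 1.0f);
    }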
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeNPUKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), - false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); - - auto stream = - context.template device_context() - .stream(); - - auto shape = out->dims(); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_tmp) - .AddOutput(*out) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_any_like, - ops::FillAnyLikeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::FillAnyLikeNPUKernel, -#endif - ops::FillAnyLikeNPUKernel, - ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc deleted file mode 100644 index fed75fc018a0c..0000000000000 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto float_value = ctx.Attr("value"); - auto str_value = ctx.Attr("str_value"); - auto force_cpu = ctx.Attr("force_cpu"); - - auto *out = ctx.Output("Out"); - auto *in = ctx.Input("Input"); - if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { - // set the correct batch size for the phi::DenseTensor. 
- auto odims = out->dims(); - int output_dim_idx = ctx.Attr("output_dim_idx"); - odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; - out->mutable_data(odims, ctx.GetPlace()); - } - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); - if (cpu_place) { - auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; - out->mutable_data(platform::CPUPlace(), - framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), - out, - static_cast(value)); - } else { - out->mutable_data(ctx.GetPlace(), - framework::TransToPhiDataType(data_type)); - phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, value); - - auto stream = - ctx.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FillD", - {tensor_tmp}, - {*out}, - {{"dims", phi::vectorize(out->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - int>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc deleted file mode 100644 index 0724caf32793e..0000000000000 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
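When the input of the deleted fill_constant_batch_size_like kernel carries LoD information, the batch dimension is taken from a LoD level rather than from the dense shape: a level stores cumulative sequence offsets, so N offsets describe N - 1 sequences. A small sketch of that arithmetic (plain standard C++, illustrative name):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Offsets {0, 2, 5} describe two sequences, [0, 2) and [2, 5),
    // hence size() - 1 below -- the same expression the kernel uses.
    size_t BatchSizeFromLoDLevel(const std::vector<size_t>& offsets) {
      return offsets.size() - 1;
    }

    int main() { assert(BatchSizeFromLoDLevel({0, 2, 5}) == 2); }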
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -template -class FillConstantNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto str_value = ctx.Attr("str_value"); - auto float_value = ctx.Attr("value"); - - auto *out_var = ctx.Output("Out"); - auto stream = - ctx.template device_context() - .stream(); - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - auto shape = GetShape(ctx); - - out_var->mutable_data(shape, ctx.GetPlace()); - if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPhiDataType(data_type)); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, value); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(*out_var) - .Run(stream); - } else { - const auto &dev_ctx = - ctx.template device_context(); - auto op_func = [&shape, &value]( - const std::vector &inputs, - const std::vector &outputs, - const NPUAttributeMap &attrs, - const platform::NPUDeviceContext &dev_ctx) { - Tensor tensor_value; - tensor_value.mutable_data({1}, dev_ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, - static_cast(value)); - - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(outputs[0]) - .Run(dev_ctx.stream()); - }; - NpuOpRunner::TypeAdapter({}, - {*out_var}, - {}, - dev_ctx, - op_func, - {}, - {framework::proto::VarType::UINT8}); - } - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - fill_constant, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::FillConstantNPUKernel, -#endif - paddle::operators::FillConstantNPUKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op_npu.cc b/paddle/fluid/operators/fill_zeros_like_op_npu.cc deleted file mode 100644 index 6cedc658f76f5..0000000000000 --- a/paddle/fluid/operators/fill_zeros_like_op_npu.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
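Both fill_constant kernels above share the same str_value convention: "inf", "-inf", and "nan" are matched explicitly because they cannot be read back through a stream, and anything else goes through stream extraction (via an int64_t temporary for integral types). A condensed model in plain standard C++ (a single double stands in for the kernels' two temporaries):

    #include <cassert>
    #include <cmath>
    #include <limits>
    #include <sstream>
    #include <string>

    template <typename T>
    T ParseFillValue(const std::string& str_value, float float_value) {
      if (str_value.empty()) return static_cast<T>(float_value);
      // Handle NaN/Inf first, which cannot be read from a stream.
      if (str_value == "inf")
        return static_cast<T>(std::numeric_limits<double>::infinity());
      if (str_value == "-inf")
        return static_cast<T>(-std::numeric_limits<double>::infinity());
      if (str_value == "nan")
        return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
      std::stringstream convert_stream(str_value);
      double tmp_value;
      convert_stream >> tmp_value;
      return static_cast<T>(tmp_value);
    }

    int main() {
      assert(std::isinf(ParseFillValue<float>("inf", 0.0f)));
      assert(ParseFillValue<int>("42", 0.0f) == 42);
      assert(ParseFillValue<float>("", 1.5f) == 1.5f);  // fallback path
    }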
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fill_zeros_like_op.h" - -namespace paddle { -namespace operators { - -template -class FillZerosLikeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - out->mutable_data(context.GetPlace()); - auto stream = - context.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*out}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel); diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc deleted file mode 100644 index 2e43c33efd575..0000000000000 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
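Several of the deleted kernels above (expand_v2 and fill_constant, for example) route unsupported dtypes through NpuOpRunner::TypeAdapter: inputs are cast to a type the CANN operator accepts (bool to uint8, int64 to int32), the operator runs, and the outputs are cast back. A schematic stand-in for that cast-run-cast-back pattern, with std::vector playing the role of tensors and element-wise identity as the wrapped op (plain standard C++, illustrative names):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    std::vector<bool> RunBoolOpViaUint8(const std::vector<bool>& in) {
      std::vector<uint8_t> widened(in.begin(), in.end());      // cast to uint8
      std::vector<uint8_t> result = widened;                   // run the op
      return std::vector<bool>(result.begin(), result.end());  // cast back
    }

    int main() {
      std::vector<bool> v{true, false, true};
      assert(RunBoolOpViaUint8(v) == v);
    }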
-#include "paddle/fluid/operators/flatten_op.h" - -namespace paddle { -namespace operators { - -template -class Flatten2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto &axis = context.Attr("axis"); - out->mutable_data(context.GetPlace(), in->type()); - framework::NPUAttributeMap attr_input = {{"axis", axis}}; - - auto stream = - context.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FlattenV2", {*in}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class Flatten2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -template -class FlattenContiguousRangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *Out = ctx.Output("Out"); - int start_axis = ctx.Attr("start_axis"); - int stop_axis = ctx.Attr("stop_axis"); - - Out->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("FlattenV2", - {*X}, - {*Out}, - {{"axis", static_cast(start_axis)}, - {"end_axis", static_cast(stop_axis)}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class FlattenContiguousRangeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(flatten2, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel); -REGISTER_OP_NPU_KERNEL(flatten2_grad, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel); -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range_grad, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel); diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc 
b/paddle/fluid/operators/gather_nd_op_npu.cc deleted file mode 100644 index feb1567e58d78..0000000000000 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class GatherNdNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->template mutable_data(ctx.GetPlace()); - - if (x->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out); - return; - } - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, - true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = ctx.template device_context().stream(); - runner.Run(stream); - } -}; - -template -class GatherNdGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *p = dx->mutable_data(ctx.GetPlace()); - - if (dx->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*dout, ctx.GetPlace(), ctx.device_context(), dx); - return; - } - - phi::DenseTensor tmp_tensor(index->type()); - phi::DenseTensor tmp_tensor2(dout->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {1, index_dims[0]}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - - tmp_tensor2.ShareDataWith(*dout); - std::vector new_dim2{1}; - for (int i = index->numel(); i < x->dims().size(); i++) { - new_dim2.push_back(x->dims()[i]); - } - tmp_tensor2.Resize(phi::make_ddim(new_dim2)); - dout = &tmp_tensor2; - } - - auto stream = ctx.template device_context().stream(); - platform::NPUMemsetAsync( - static_cast(p), 0, dx->numel() * sizeof(T), stream); - - const auto &runner_scatter = NpuOpRunner( - "ScatterNdAdd", 
{*dx, *index, *dout}, {*dx}, {{"use_locking", false}}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gather_nd, - ops::GatherNdNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::GatherNdNPUKernel, -#endif - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL(gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc deleted file mode 100644 index ab42d78a0a1d7..0000000000000 --- a/paddle/fluid/operators/gather_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class GatherOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto &runner = NpuOpRunner( - "Gather", {*x, *index}, {*out}, {{"validate_indices", true}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GatherGradOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - // step1: Unsqueeze index - phi::DenseTensor tmp_tensor(index->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {index_dims[0], 1}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - } - - auto stream = - ctx.template device_context() - .stream(); - - // step2: ZerosLike x in device - Tensor zeroslike_xout(dx->type()); - zeroslike_xout.Resize(x->dims()); - auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); - - platform::NPUMemsetAsync( - static_cast(p), 0, zeroslike_xout.numel() * sizeof(T), stream); - - // step3: scatter(x_grad) - const auto &runner_scatter = NpuOpRunner( - "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_grad, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc deleted 
file mode 100644 index 69d82ecaedeea..0000000000000 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gather); -USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP_ITSELF(gather_grad); -USE_OP_DEVICE_KERNEL(gather_grad, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - std::vector init_x; - for (int64_t i = 1; i < 7; ++i) { - // 1,2,3,4,5,6 - init_x.push_back(static_cast(i)); - } - - // [[1, 2],[3, 4],[5, 6]] - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_index = {1, 2}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - f::AttributeMap attrs = {{"validate_indices", true}}; - auto op = f::OpRegistry::CreateOp( - op_type, {{"X", {"X"}}, {"Index", {"Index"}}}, {{"Out", {"Out"}}}, attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - uint32_t expected_size = 4; - EXPECT_EQ((uint32_t)out_vec.size(), expected_size); - - // {3, 4, 5, 6} - std::vector expected_out_vec; - for (int64_t i = 3; i < 7; ++i) { - expected_out_vec.push_back(static_cast(i)); - } - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_out_vec[i]); - } -} - -template -void CompareGrad(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector init_index = {0, 1}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - 
tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize(phi::make_ddim({2, 2})); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, - {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - ctx.Wait(); - - uint32_t expected_size = 3 * 2; - EXPECT_EQ((uint32_t)dx_vec.size(), expected_size); - - std::vector expected_dx_vec = {5.0, 10.0, 2.0, 3.0, 0.0, 0.0}; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - VLOG(3) << "dx_vec[i]=" << dx_vec[i]; - EXPECT_EQ(dx_vec[i], expected_dx_vec[i]); - } -} - -TEST(gather, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather, NPU_fp16) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "gather_grad"); -} diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc deleted file mode 100644 index 9b3c23ad2b9c1..0000000000000 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
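The deleted gather tests above pin down the axis-0 semantics: with X = [[1,2],[3,4],[5,6]] and Index = [1,2], Out is [[3,4],[5,6]], and the backward pass scatters the dOut rows into a zero tensor at the indexed rows (TensorScatterUpdate in the kernel). A host-side model of the forward rule (plain standard C++, illustrative name):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Out[i] = X[Index[i]] along axis 0, as checked by the deleted test.
    std::vector<std::vector<float>> Gather(
        const std::vector<std::vector<float>>& x,
        const std::vector<int32_t>& index) {
      std::vector<std::vector<float>> out;
      out.reserve(index.size());
      for (int32_t i : index) out.push_back(x[i]);
      return out;
    }

    int main() {
      std::vector<std::vector<float>> x{{1, 2}, {3, 4}, {5, 6}};
      auto out = Gather(x, {1, 2});
      assert(out[0][0] == 3 && out[1][1] == 6);  // {{3, 4}, {5, 6}}
    }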
*/ - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::normal_distribution dist(mean, std); - - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = dist(*engine); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gaussian_random, ops::NPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc deleted file mode 100644 index 1b40a6fbb454c..0000000000000 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class GeluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class GeluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(pangyoki): In the original implementation of GeluGrad op, the input - // is {*dout, *x, out}, where out = Gelu(x). However, we find that variable - // `out` was not actually used. In order to improve performance, the - // useless GELU operation was deleted. 
- // We directly use `*dout` as a placeholder to replace `out`, it will not - // be used in calculations. - const auto& runner_dx = - NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - gelu, - ops::GeluNPUKernel, - ops::GeluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gelu_grad, - ops::GeluGradNPUKernel, - ops::GeluGradNPUKernel); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc deleted file mode 100644 index 9dca0bb8cba0f..0000000000000 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gelu); -USE_OP_DEVICE_KERNEL(gelu, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp( - "gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - float expected = 0.841192; - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_FLOAT_EQ(out_vec[i], static_cast(expected)); - } -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_dout; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_dout.push_back(static_cast(1.0)); - } - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize({10, 
10}); - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp("gelu_grad", - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - float expected = 1.082964; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_FLOAT_EQ(dx_vec[i], static_cast(expected)); - } -} - -TEST(gelu, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(gelu_grad, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc deleted file mode 100644 index 49fdd3566825b..0000000000000 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
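The expected value hard-coded in the deleted gelu test above (0.841192 at x = 1) matches the tanh approximation of GELU rather than the exact erf form, which gives about 0.841345 at x = 1; this suggests the CANN Gelu operator implements the approximation. A quick check in plain standard C++:

    #include <cassert>
    #include <cmath>

    // Tanh approximation of GELU:
    //   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    float GeluTanh(float x) {
      const float kAlpha = 0.7978845608f;  // sqrt(2 / pi)
      return 0.5f * x *
             (1.0f + std::tanh(kAlpha * (x + 0.044715f * x * x * x)));
    }

    int main() {
      // Agrees with the test's expected 0.841192 to roughly 1e-5.
      assert(std::fabs(GeluTanh(1.0f) - 0.841192f) < 1e-4f);
    }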
*/ - -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct GroupNormFunction { - public: - explicit GroupNormFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void ReduceMean(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceMeanD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Div(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Sqrt(const phi::DenseTensor* x, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - phi::DenseTensor ReduceMeanToNG(const phi::DenseTensor* x, - const DataLayout& data_layout, - const int64_t N, - const int64_t C, - const int64_t H, - const int64_t W, - const int G) { - phi::DenseTensor y(x->type()); - // y.mutable_data( {N,G,1}, place ); - if (data_layout == DataLayout::kNCHW) { - y.mutable_data({N, G, 1}, place); - // shape of x is [N, G, C*H*W/G] - this->ReduceMean(x, &y, std::vector{2}); - } else { - y.mutable_data({N, 1, G}, place); - // shape of x is [N, C*H*W/G, G] - phi::DenseTensor x_trans(x->type()); - x_trans.mutable_data({N, G, C * H * W / G}, place); - this->Transpose(x, &x_trans, std::vector{0, 2, 1}); - this->ReduceMean(&x_trans, &y, std::vector{2}); - } - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -class GroupNormNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto groups = ctx.Attr("groups"); - - auto place = ctx.GetPlace(); - phi::DenseTensor xnorm(x->type()); - xnorm.mutable_data(x->dims(), place); - GroupNormFunction F(ctx); - if (data_layout != DataLayout::kNCHW) { - xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); - F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); - } else { - paddle::framework::TensorCopy(*x, platform::NPUPlace(), &xnorm); - } - auto N = xnorm.dims()[0]; - auto C = xnorm.dims()[1]; - auto H = xnorm.dims()[2]; - auto W = xnorm.dims()[3]; - xnorm.Resize({N * groups, C * H * W / groups}); - std::vector axis = {1}; - auto reduce_dim = mean->dims(); - - mean->mutable_data({N * groups, 1}, place); - var->mutable_data({N * groups, 1}, place); - y->mutable_data(place); - F.ReduceMean(&xnorm, mean, axis); - - F.Sub(&xnorm, mean, &xnorm); - phi::DenseTensor sqr(x->type()); - sqr.mutable_data(xnorm.dims(), place); - - F.Mul(&xnorm, &xnorm, &sqr); - F.ReduceMean(&sqr, var, axis); - phi::DenseTensor std(x->type()); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - y->Resize(xnorm.dims()); - F.Div(&xnorm, &std, y); - y->Resize({N, C, H, W}); - if (scale) { - phi::DenseTensor scale_t(scale->type()); - scale_t.ShareDataWith(*scale); - scale_t.Resize({C, 1, 1}); - F.Mul(y, &scale_t, y); - } - if (bias) { - phi::DenseTensor bias_t(bias->type()); - bias_t.ShareDataWith(*bias); - bias_t.Resize({C, 1, 1}); - F.Add(y, &bias_t, y); - } - if (data_layout != DataLayout::kNCHW) { - F.Transpose(y, y, std::vector{0, 2, 3, 1}); - y->Resize({x->dims()}); - } - mean->Resize(reduce_dim); - var->Resize(reduce_dim); - } -}; - -template -class GroupNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* y = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - const auto G = ctx.Attr("groups"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - GroupNormFunction F(ctx); - auto place = ctx.GetPlace(); - auto _type = y->type(); - - phi::DenseTensor xnorm(_type); - xnorm.mutable_data(y->dims(), place); - phi::DenseTensor scale_share(_type); - scale_share.ShareDataWith(*scale); - phi::DenseTensor bias_share(_type); - bias_share.ShareDataWith(*bias); - - int64_t N = y->dims()[0]; - int64_t C, H, W; - framework::DDim scale_bias_dim; - if (data_layout == DataLayout::kNCHW) { - C = y->dims()[1]; - H = y->dims()[2]; - W = y->dims()[3]; - scale_bias_dim = phi::make_ddim({C, 1, 1}); - } else { - C = y->dims()[3]; - H = y->dims()[1]; - W = y->dims()[2]; - scale_bias_dim = phi::make_ddim({1, 1, C}); - } - 
scale_share.Resize(scale_bias_dim); - bias_share.Resize(scale_bias_dim); - F.Sub(y, &bias_share, &xnorm); - F.DivNoNan(&xnorm, &scale_share, &xnorm); - - if (d_bias) { - d_bias->mutable_data(place); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); - } else { - F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); - } - } - if (d_scale) { - d_scale->mutable_data(place); - phi::DenseTensor dy_xnorm(_type); - dy_xnorm.mutable_data(d_y->dims(), place); - F.Mul(d_y, &xnorm, &dy_xnorm); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); - } else { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); - } - } - - // std = Sqrt(var+epsilon), init shape = [ N, G ] - phi::DenseTensor std(_type); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - // d_xnorm_std = dy_proc * scale / std - phi::DenseTensor d_xnorm_std(_type); - d_xnorm_std.mutable_data(y->dims(), place); - F.Mul(d_y, &scale_share, &d_xnorm_std); - if (data_layout == DataLayout::kNCHW) { - xnorm.Resize({N, G, C * H * W / G}); - d_xnorm_std.Resize({N, G, C * H * W / G}); - std.Resize({N, G, 1}); - } else { - xnorm.Resize({N, C * H * W / G, G}); - d_xnorm_std.Resize({N, C * H * W / G, G}); - std.Resize({N, 1, G}); - } - F.Div(&d_xnorm_std, &std, &d_xnorm_std); - - // d_x = d_xnorm_std - // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm - // - Mean ( d_xnorm_std, axis=1, keepdim=True ) - d_x->mutable_data(place); - d_x->Resize(xnorm.dims()); - F.Mul(&d_xnorm_std, &xnorm, d_x); - phi::DenseTensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); - F.Mul(&dx1, &xnorm, d_x); - - phi::DenseTensor dx2 = - F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); - - F.Sub(&d_xnorm_std, d_x, d_x); - F.Sub(d_x, &dx2, d_x); - - d_x->Resize(y->dims()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(group_norm, - ops::GroupNormNPUKernel, - ops::GroupNormNPUKernel); -REGISTER_OP_NPU_KERNEL(group_norm_grad, - ops::GroupNormGradNPUKernel, - ops::GroupNormGradNPUKernel); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc deleted file mode 100644 index 4812dfa47dfed..0000000000000 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
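The deleted GroupNorm kernels above reshape the input to [N*G, C*H*W/G], compute mean and variance per group, normalize by sqrt(var + epsilon), and then apply per-channel scale and bias. A minimal single-sample NCHW model of the forward math (plain standard C++, illustrative names):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // One sample (N = 1), NCHW: per group g, y = (x - mean_g)/sqrt(var_g + eps),
    // then y = y * scale[c] + bias[c] per channel c.
    std::vector<float> GroupNormForward(const std::vector<float>& x, int C,
                                        int HW, int G, float eps,
                                        const std::vector<float>& scale,
                                        const std::vector<float>& bias) {
      std::vector<float> y(x.size());
      const int group_size = (C / G) * HW;  // elements per group
      for (int g = 0; g < G; ++g) {
        const int begin = g * group_size;
        float mean = 0.0f, var = 0.0f;
        for (int i = 0; i < group_size; ++i) mean += x[begin + i];
        mean /= group_size;
        for (int i = 0; i < group_size; ++i) {
          const float d = x[begin + i] - mean;
          var += d * d;
        }
        var /= group_size;
        const float inv_std = 1.0f / std::sqrt(var + eps);
        for (int i = 0; i < group_size; ++i) {
          const int c = (begin + i) / HW;  // channel of this element
          y[begin + i] = (x[begin + i] - mean) * inv_std * scale[c] + bias[c];
        }
      }
      return y;
    }

    int main() {
      // Two channels, two groups: each constant channel normalizes to its bias.
      auto y = GroupNormForward({1, 1, 5, 5}, /*C=*/2, /*HW=*/2, /*G=*/2,
                                1e-5f, {1.0f, 1.0f}, {0.0f, 0.0f});
      assert(std::fabs(y[0]) < 1e-3f && std::fabs(y[2]) < 1e-3f);
    }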
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void HuberLossSub(const platform::Place& place,
-                  const aclrtStream& stream,
-                  const phi::DenseTensor* x,
-                  const phi::DenseTensor* y,
-                  phi::DenseTensor* z) {
-  // Calculate z = x - y
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossMuls(const platform::Place& place,
-                   const aclrtStream& stream,
-                   const phi::DenseTensor* x,
-                   float scalar,
-                   phi::DenseTensor* y) {
-  // Calculate y = x * scalar
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossZerosLike(const platform::Place& place,
-                        const aclrtStream& stream,
-                        const phi::DenseTensor* x,
-                        phi::DenseTensor* y) {
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*y}, {});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossSmoothL1Loss(const platform::Place& place,
-                           const aclrtStream& stream,
-                           const phi::DenseTensor* x,
-                           const phi::DenseTensor* y,
-                           float delta,
-                           phi::DenseTensor* z) {
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner =
-      NpuOpRunner("SmoothL1Loss", {*x, *y}, {*z}, {{"sigma", delta}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossSmoothL1LossGrad(const platform::Place& place,
-                               const aclrtStream& stream,
-                               const phi::DenseTensor* pred,
-                               const phi::DenseTensor* lab,
-                               const phi::DenseTensor* dout,
-                               float sigma,
-                               phi::DenseTensor* grad) {
-  grad->mutable_data<T>(pred->dims(), place);
-  const auto& runner = NpuOpRunner(
-      "SmoothL1LossGrad", {*pred, *lab, *dout}, {*grad}, {{"sigma", sigma}});
-  runner.Run(stream);
-}
-
-template <typename T>
-class HuberLossNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in0 = ctx.Input<phi::DenseTensor>("X");
-    auto* in1 = ctx.Input<phi::DenseTensor>("Y");
-    auto* residual = ctx.Output<phi::DenseTensor>("Residual");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    auto delta = ctx.Attr<float>("delta");
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    auto place = ctx.GetPlace();
-    HuberLossSub<T>(place, stream, in1, in0, residual);
-
-    HuberLossSmoothL1Loss<T>(place, stream, in0, in1, delta, out);
-    HuberLossMuls<T>(place, stream, out, delta, out);
-  }
-};
-
-template <typename T>
-class HuberLossGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* residual = ctx.Input<phi::DenseTensor>("Residual");
-    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
-    auto delta = ctx.Attr<float>("delta");
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    auto place = ctx.GetPlace();
-
-    phi::DenseTensor t_grad_rd;
-    if (dx || dy) {
-      phi::DenseTensor t_zero;
-      HuberLossZerosLike<T>(place, stream, residual, &t_zero);
-      HuberLossSmoothL1LossGrad<T>(
-          place, stream, residual, &t_zero, dout, delta, &t_grad_rd);
-    }
-    if (dx) {
-      HuberLossMuls<T>(place, stream, &t_grad_rd, -delta, dx);
-    }
-    if (dy) {
-      HuberLossMuls<T>(place, stream, &t_grad_rd, delta, dy);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(huber_loss,
-                       ops::HuberLossNPUKernel<float>,
-                       ops::HuberLossNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(huber_loss_grad,
-                       ops::HuberLossGradNPUKernel<float>,
ops::HuberLossGradNPUKernel); diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc deleted file mode 100644 index 7188fe38fdc68..0000000000000 --- a/paddle/fluid/operators/increment_op_npu.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class IncrementalNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); - float step = context.Attr("step"); - out_tensor->mutable_data(context.GetPlace()); - - Tensor step_tensor(x_tensor->dtype()); - - step_tensor.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&step_tensor, static_cast(step)); - - const auto& runner = - NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - increment, - paddle::operators::IncrementalNPUKernel, - paddle::operators::IncrementalNPUKernel, - paddle::operators::IncrementalNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::IncrementalNPUKernel, -#endif - paddle::operators::IncrementalNPUKernel) diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc deleted file mode 100644 index 2a77ff82d0fa3..0000000000000 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(increment); -USE_OP_DEVICE_KERNEL(increment, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - init.push_back(static_cast(1.0)); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attr_input = {{"step", static_cast(2.0)}}; - auto op = f::OpRegistry::CreateOp( - "increment", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attr_input); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1); - EXPECT_EQ(out_vec[0], static_cast(3.0)); -} - -TEST(increment, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} - -TEST(increment, NPU_fp64) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc deleted file mode 100644 index 64a50041421b3..0000000000000 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* input, - phi::DenseTensor* out) { - auto index_dims = index->dims(); - auto input_dims = input->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector gather_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - gather_index_vec.push_back(i); - gather_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor gather_index; - framework::TensorFromVector(gather_index_vec, dev_ctx, &gather_index); - gather_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("GatherNd") - .AddInput(*input) - .AddInput(gather_index) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGather(dev_ctx, index, input, out); - } else { - IndexSampleGather(dev_ctx, index, input, out); - } - } -}; - -template -void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* out_grad, - phi::DenseTensor* x_grad) { - auto index_dims = index->dims(); - auto input_dims = x_grad->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector scatter_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - scatter_index_vec.push_back(i); - scatter_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor scatter_index; - framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); - scatter_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("ScatterNd") - .AddInput(scatter_index) - .AddInput(*out_grad) - .AddInput(phi::vectorize(x_grad->dims())) - .AddOutput(*x_grad); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - x_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } else { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(index_sample, - ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel, 
- ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel); -REGISTER_OP_NPU_KERNEL(index_sample_grad, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel); diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc deleted file mode 100644 index dd9c5608a0469..0000000000000 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class IndexSelectNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto dim = ctx.Attr("dim"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*x) - .AddInput(*index) - .AddInput(std::vector{dim}) - .AddOutput(*out); - runner.Run(stream); - } -}; - -template -class IndexSelectGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - - auto stream = - ctx.template device_context() - .stream(); - - auto x_dims = x_grad->dims(); - auto out_dims = out_grad->dims(); - - int dim = ctx.Attr("dim"); - if (dim < 0) { - dim += out_dims.size(); - } - - phi::DenseTensor casted_index; - if (framework::TransToProtoVarType(index->dtype()) != - framework::proto::VarType::INT32) { - casted_index.mutable_data(index->dims(), ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*index}, {casted_index}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_index.ShareDataWith(*index); - } - - if (dim == 0) { - x_grad->mutable_data(ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {*x_grad}, {*x_grad}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(*x_grad); - runner.Run(stream); - } else { - phi::DenseTensor transed_out_grad; - std::vector in_trans_perm; - in_trans_perm.push_back(dim); - for (int i = 0; i < out_dims.size(); ++i) { - if (i == dim) continue; - in_trans_perm.push_back(i); - } - framework::DDim transed_out_dims(out_dims); - for (size_t i = 0; i < in_trans_perm.size(); ++i) { - transed_out_dims[i] = out_dims[in_trans_perm[i]]; - } - transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - NpuOpRunner in_trans_runner; - in_trans_runner.SetType("Transpose") 
- .AddInput(*out_grad) - .AddInput(std::move(in_trans_perm)) - .AddOutput(transed_out_grad); - in_trans_runner.Run(stream); - - phi::DenseTensor sum_out; - framework::DDim sum_dims(x_dims); - sum_dims[0] = x_dims[dim]; - auto idx = 1; - for (int i = 0; i < x_dims.size(); ++i) { - if (i == dim) continue; - sum_dims[idx++] = x_dims[i]; - } - sum_out.mutable_data(sum_dims, ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {sum_out}, {sum_out}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(transed_out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(sum_out); - runner.Run(stream); - - std::vector out_trans_perm; - for (int i = 1; i < 1 + dim; ++i) { - out_trans_perm.push_back(i); - } - out_trans_perm.push_back(0); - for (int i = 1 + dim; i < x_dims.size(); ++i) { - out_trans_perm.push_back(i); - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner out_trans_runner; - out_trans_runner.SetType("Transpose") - .AddInput(sum_out) - .AddInput(std::move(out_trans_perm)) - .AddOutput(*x_grad); - out_trans_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - index_select, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel); -REGISTER_OP_NPU_KERNEL( - index_select_grad, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel); diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc deleted file mode 100644 index 03307895f09e2..0000000000000 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class InstanceNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("SavedMean"); - auto* variance = ctx.Output("SavedVariance"); - auto& dev_ctx = ctx.template device_context(); - - dev_ctx.template Alloc(y); - dev_ctx.template Alloc(mean); - dev_ctx.template Alloc(variance); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - PADDLE_ENFORCE(x_dims.size() <= 5 && x_dims.size() >= 3, - platform::errors::InvalidArgument( - "InstanceNorm only supports the dimension of input " - " less equal to 5 and greater equal to 3. 
the dimension " - "of input is %d.", - x_dims.size())); - - auto tmp_x_dims = phi::vectorize(x_dims); - auto tmp_y_dims = phi::vectorize(y_dims); - if (x_dims.size() < 5) { - for (size_t i = x_dims.size(); i < 5; ++i) { - tmp_x_dims.insert(tmp_x_dims.begin() + 2, 1); - tmp_y_dims.insert(tmp_y_dims.begin() + 2, 1); - } - } - - phi::DenseTensor tmp_x, tmp_y; - tmp_x.ShareDataWith(*x); - - tmp_x.Resize(phi::make_ddim(tmp_x_dims)); - tmp_x.set_layout(phi::DataLayout::NCDHW); - tmp_y.ShareDataWith(*y); - tmp_y.Resize(phi::make_ddim(tmp_y_dims)); - tmp_y.set_layout(phi::DataLayout::NCDHW); - - NpuOpRunner runner; - - runner.SetType("InstanceNorm") - .AddInput(tmp_x) - .AddInput(*scale) - .AddInput(*bias) - .AddAttr("data_format", std::string("NCDHW")) - .AddAttr("epsilon", epsilon) - .AddOutput(tmp_y) - .AddOutput(*mean) - .AddOutput(*variance); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - instance_norm, - ops::InstanceNormNPUKernel, - ops::InstanceNormNPUKernel); diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc deleted file mode 100644 index 108efafff683f..0000000000000 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/operators/interpolate_op.h"
-
-namespace paddle {
-namespace operators {
-using DataLayout = phi::DataLayout;
-
-inline static void CheckArgument(const framework::ExecutionContext& ctx) {
-  const std::string interp_method = ctx.Attr<std::string>("interp_method");
-#if (CANN_VERSION_CODE < 512000)
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  PADDLE_ENFORCE_EQ(
-      align_corners,
-      false,
-      platform::errors::InvalidArgument(
-          "NPU Interpolate Kernel gives different results when "
-          "align_corners is true."));
-#endif
-  PADDLE_ENFORCE_EQ(
-      interp_method,
-      "nearest",
-      platform::errors::InvalidArgument(
-          "NPU Interpolate Kernel only supports nearest interpolation."));
-}
-
-inline static void ExtractNCHW(const framework::DDim& dims,
-                               const DataLayout& data_layout,
-                               int32_t* n,
-                               int32_t* c,
-                               int32_t* h,
-                               int32_t* w) {
-  *n = dims[0];
-  if (data_layout == DataLayout::kNCHW) {
-    *c = dims[1];
-    *h = dims[2];
-    *w = dims[3];
-  } else {  // kNHWC
-    *h = dims[1];
-    *w = dims[2];
-    *c = dims[3];
-  }
-}
-
-static void CalcOutSize(const framework::ExecutionContext& ctx,
-                        int32_t in_h,
-                        int32_t in_w,
-                        int32_t* out_h,
-                        int32_t* out_w) {
-  // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w
-  *out_h = ctx.Attr<int>("out_h");
-  *out_w = ctx.Attr<int>("out_w");
-
-  auto dev_ctx = platform::DeviceContextPool::Instance().Get(ctx.GetPlace());
-  auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
-
-  if (list_new_size_tensor.size() > 0) {
-    std::vector<int32_t> new_size_h(1);
-    std::vector<int32_t> new_size_w(1);
-    framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &new_size_h);
-    framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &new_size_w);
-    *out_h = new_size_h[0];
-    *out_w = new_size_w[0];
-  } else {
-    float scale;
-    auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
-    if (scale_tensor != nullptr) {
-      std::vector<float> scale_data;
-      framework::TensorToVector(*scale_tensor, *dev_ctx, &scale_data);
-      scale = scale_data[0];
-    } else {
-      scale = ctx.Attr<float>("scale");
-    }
-
-    if (scale > 0) {
-      *out_h = static_cast<int32_t>(in_h * scale);
-      *out_w = static_cast<int32_t>(in_w * scale);
-    }
-
-    auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
-    if (out_size != nullptr) {
-      std::vector<int32_t> out_size_data;
-      framework::TensorToVector(*out_size, *dev_ctx, &out_size_data);
-      *out_h = out_size_data[0];
-      *out_w = out_size_data[1];
-    }
-  }
-
-  PADDLE_ENFORCE_GT(*out_h,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "out_h in Attr(out_shape) of Op(interpolate) "
-                        "should be greater than 0."));
-  PADDLE_ENFORCE_GT(*out_w,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "out_w in Attr(out_shape) of Op(interpolate) "
-                        "should be greater than 0."));
-}
-
-template <typename T>
-class InterpolateNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // NOTE(Ruibiao):
-    // this kernel only supports nearest interpolation for 2D images;
-    // the Ascend 'ResizeNearestNeighborV2' used in this kernel gives
-    // different results when 'align_corners' is 'true' or the data type
-    // is 'double'
-    CheckArgument(ctx);
-
-    auto* input = ctx.Input<phi::DenseTensor>("X");
-    framework::DDim input_dims = input->dims();
-
-    const std::string data_layout_str =
-        ctx.Attr<std::string>("data_layout");  // kNCHW or kNHWC
-    const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
-
-    int32_t n, c, h, w, out_h, out_w;
-    ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w);
-    CalcOutSize(ctx, h, w, &out_h, &out_w);
-
-    // the 'input' tensor may have an unset (or wrongly set) layout
-    phi::DenseTensor input_x(input->type());
input_x.ShareDataWith(*input); - input_x.set_layout(data_layout); - - auto* output = ctx.Output("Out"); - framework::DDim output_dims; - if (data_layout == DataLayout::kNCHW) { - output_dims = {n, c, out_h, out_w}; - } else { - output_dims = {n, out_h, out_w, c}; - } - output->set_layout(data_layout); - output->mutable_data(output_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2") - .AddInput(input_x) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -template -class InterpolateGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // NOTE(Ruibiao): - // this kernel only support nearest interpolotion for 2D images - // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff - // when 'align_corners' is 'true' or data type is 'double' - CheckArgument(ctx); - - auto* input = ctx.Input("X"); - framework::DDim input_dims = input->dims(); - - const std::string data_layout_str = - ctx.Attr("data_layout"); // kNCHW or kNHWC - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - int32_t n, c, h, w, out_h, out_w; - ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); - CalcOutSize(ctx, h, w, &out_h, &out_w); - - // the 'output_grad' tensor may has no set (or wrong set) of the layout - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - phi::DenseTensor output_grad_tmp(output_grad->type()); - output_grad_tmp.ShareDataWith(*output_grad); - output_grad_tmp.set_layout(data_layout); - - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - input_grad->set_layout(data_layout); - framework::DDim input_grad_dims; - if (data_layout == DataLayout::kNCHW) { - input_grad_dims = {n, c, h, w}; - } else { - input_grad_dims = {n, h, w, c}; - } - input_grad->mutable_data(input_grad_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(output_grad_tmp) - .AddInput(std::vector{h, w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(nearest_interp, - ops::InterpolateNPUKernel, - ops::InterpolateNPUKernel); -REGISTER_OP_NPU_KERNEL(nearest_interp_grad, - ops::InterpolateGradNPUKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc deleted file mode 100644 index d16494f229e42..0000000000000 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ /dev/null @@ -1,812 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/interpolate_function.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; -using DDim = framework::DDim; -using fp16 = paddle::platform::float16; - -template -struct InterpolateFunction { - public: - explicit InterpolateFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->dtype())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Gather(const phi::DenseTensor* x, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}}); - runner.Run(stream); - } - void GatherGrad(const phi::DenseTensor* gy, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* gx) { - // 1 gy swapaxis: axis & 0 - int len = (gy->dims()).size(); - std::vector axis_swap(len); - for (int i = 0; i < len; i++) { - axis_swap[i] = i; - } - axis_swap[0] = axis; - axis_swap[axis] = 0; - auto y_new_shape = gy->dims(); - auto yt = y_new_shape[axis]; - y_new_shape[axis] = y_new_shape[0]; - y_new_shape[0] = yt; - phi::DenseTensor gy_t; - gy_t.mutable_data(y_new_shape, place); - Transpose(gy, &gy_t, axis_swap); - // 2 scatter - auto x_new_shape = gx->dims(); - auto xt = x_new_shape[axis]; - x_new_shape[axis] = x_new_shape[0]; - x_new_shape[0] = xt; - phi::DenseTensor gx_zero, gx_t; - gx_zero.mutable_data(x_new_shape, place); - gx_t.mutable_data(x_new_shape, place); - FillNpuTensorWithConstant(&gx_zero, static_cast(0)); - gx_zero.Resize(x_new_shape); - Scatter(&gx_zero, indices, &gy_t, &gx_t); - // 3 gx swapaxis: axis, 0 - Transpose(&gx_t, gx, axis_swap); - } - void Scatter(const phi::DenseTensor* x, - const phi::DenseTensor* index, - const phi::DenseTensor* updates, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("TensorScatterAdd", 
{*x, *index, *updates}, {*y}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Floor(const phi::DenseTensor* x, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -void InterpolateParamCompute(const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout, - const DDim& indim, - const DDim& outdim, - int* axis_h, - int* axis_w, - int* in_h, - int* in_w, - int* out_h, - int* out_w, - float* ratio_h, - float* ratio_w) { - if (data_layout == DataLayout::kNCHW) { - *axis_h = 2; - *axis_w = 3; - } else { - *axis_h = 1; - *axis_w = 2; - } - *out_h = outdim[*axis_h]; - *out_w = outdim[*axis_w]; - *in_h = indim[*axis_h]; - *in_w = indim[*axis_w]; - *ratio_h = 0.0f; - *ratio_w = 0.0f; - if (*out_h > 1) { - *ratio_h = - align_corners - ? static_cast(*in_h - 1) / (*out_h - 1) - : (scale_h > 0 ? 1 / scale_h : static_cast(*in_h) / *out_h); - } - if (*out_w > 1) { - *ratio_w = - align_corners - ? static_cast(*in_w - 1) / (*out_w - 1) - : (scale_w > 0 ? 
1 / scale_w : static_cast(*in_w) / *out_w); - } -} - -template -void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, - const DataLayout& data_layout, - int in_h, - int in_w, - int out_h, - int out_w, - bool align_cond, - float ratio_h, - float ratio_w, - phi::DenseTensor* h0, - phi::DenseTensor* h1, - phi::DenseTensor* w0, - phi::DenseTensor* w1, - phi::DenseTensor* coef_h0, - phi::DenseTensor* coef_h1, - phi::DenseTensor* coef_w0, - phi::DenseTensor* coef_w1) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - phi::DenseTensor _h0, _w0; - _h0.mutable_data({out_h}, place); - _w0.mutable_data({out_w}, place); - F.Arange(out_h, &_h0); - F.Arange(out_w, &_w0); - if (align_cond) { - F.Adds(&_h0, static_cast(0.5), &_h0); - F.Adds(&_w0, static_cast(0.5), &_w0); - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - F.Adds(&_h0, static_cast(-0.5), &_h0); - F.Adds(&_w0, static_cast(-0.5), &_w0); - } else { - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - } - - phi::DenseTensor zero_t; - phi::DenseTensor one_t; - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - F.Maximum(&_h0, &zero_t, &_h0); - F.Maximum(&_w0, &zero_t, &_w0); - - phi::DenseTensor _h0_floor, _w0_floor; - _h0_floor.mutable_data({out_h}, place); - _w0_floor.mutable_data({out_w}, place); - F.Floor(&_h0, &_h0_floor); - F.Floor(&_w0, &_w0_floor); - F.Cast(&_h0_floor, h0); - F.Cast(&_w0_floor, w0); - - phi::DenseTensor one_int; - one_int.mutable_data({1}, place); - FillNpuTensorWithConstant(&one_int, static_cast(1)); - F.Add(h0, &one_int, h1); - F.Add(w0, &one_int, w1); - phi::DenseTensor t_max_h, t_max_w; - t_max_h.mutable_data({1}, place); - t_max_w.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_max_h, static_cast(in_h - 1)); - FillNpuTensorWithConstant(&t_max_w, static_cast(in_w - 1)); - F.Minimum(h1, &t_max_h, h1); - F.Minimum(w1, &t_max_w, w1); - - F.Sub(&_h0, &_h0_floor, coef_h1); - F.Sub(&_w0, &_w0_floor, coef_w1); - F.Sub(&one_t, coef_h1, coef_h0); - F.Sub(&one_t, coef_w1, coef_w0); - - if (data_layout == DataLayout::kNCHW) { - coef_h0->Resize({out_h, 1}); - coef_h1->Resize({out_h, 1}); - } else { - coef_h0->Resize({out_h, 1, 1}); - coef_h1->Resize({out_h, 1, 1}); - coef_w0->Resize({out_w, 1}); - coef_w1->Resize({out_w, 1}); - } -} - -template -void BilinearFwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* input, - phi::DenseTensor* output, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = output->dims(); - auto indim = input->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - 
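A reading aid for the tensors prepared by BilinearParamTensorCompute in the call that follows; this is the standard bilinear sampling scheme the code implements, in my own notation rather than anything from the file. For an output row index \(r\) (columns are analogous):

\[
h = \begin{cases}
(r + 0.5)\,\mathrm{ratio}_h - 0.5, & \text{align\_cond (half-pixel)} \\
r\,\mathrm{ratio}_h, & \text{otherwise}
\end{cases}
\]
\[
h_0 = \lfloor \max(h, 0) \rfloor, \quad h_1 = \min(h_0 + 1, H_{\mathrm{in}} - 1), \quad
c_{h_1} = h - \lfloor h \rfloor, \quad c_{h_0} = 1 - c_{h_1},
\]

and the four Gather/Mul passes below blend \(x[h_i, w_j]\) with weights \(c_{h_i} c_{w_j}\) before the final ReduceSum.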
BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor input_gather_h0, input_gather_h1; - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - input_gather_h0.mutable_data(dim_gather_h, place); - input_gather_h1.mutable_data(dim_gather_h, place); - - F.Gather(input, &h0, axis_h, &input_gather_h0); - F.Gather(input, &h1, axis_h, &input_gather_h1); - - F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0); - F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1); - phi::DenseTensor out_x4; - out_x4.mutable_data({4, outdim[0], outdim[1], outdim[2], outdim[3]}, - place); - phi::DenseTensor input_gather_h0_w0 = out_x4.Slice(0, 1); - phi::DenseTensor input_gather_h0_w1 = out_x4.Slice(1, 2); - phi::DenseTensor input_gather_h1_w0 = out_x4.Slice(2, 3); - phi::DenseTensor input_gather_h1_w1 = out_x4.Slice(3, 4); - F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0); - F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1); - F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0); - F.Gather(&input_gather_h1, &w1, axis_w, &input_gather_h1_w1); - F.Mul(&input_gather_h0_w0, &coef_w0, &input_gather_h0_w0); - F.Mul(&input_gather_h0_w1, &coef_w1, &input_gather_h0_w1); - F.Mul(&input_gather_h1_w0, &coef_w0, &input_gather_h1_w0); - F.Mul(&input_gather_h1_w1, &coef_w1, &input_gather_h1_w1); - F.ReduceSum(&out_x4, output, std::vector{0}, false); -} - -template -void BilinearBwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* gout, - phi::DenseTensor* gin, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = gout->dims(); - auto indim = gin->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor gy_w0, gy_w1; - gy_w0.mutable_data(outdim, place); - gy_w1.mutable_data(outdim, place); - F.Mul(gout, &coef_w0, &gy_w0); - F.Mul(gout, &coef_w1, &gy_w1); - - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - phi::DenseTensor g_gather_w0, g_gather_w1; - g_gather_w0.mutable_data(dim_gather_h, place); - g_gather_w1.mutable_data(dim_gather_h, place); - w0.Resize({out_w, 1}); - w1.Resize({out_w, 1}); - F.GatherGrad(&gy_w0, &w0, axis_w, &g_gather_w0); - F.GatherGrad(&gy_w1, &w1, axis_w, &g_gather_w1); - - F.Add(&g_gather_w0, &g_gather_w1, &g_gather_w0); - F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1); - F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0); - - phi::DenseTensor gx_0, gx_1; - 
gx_0.mutable_data(indim, place); - gx_1.mutable_data(indim, place); - h0.Resize({out_h, 1}); - h1.Resize({out_h, 1}); - F.GatherGrad(&g_gather_w0, &h0, axis_h, &gx_0); - F.GatherGrad(&g_gather_w1, &h1, axis_h, &gx_1); - - F.Add(&gx_0, &gx_1, gin); -} - -template -class InterpolateV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ( - input_dims.size(), - 4UL, - platform::errors::External( - "NPU Interpolate Kernel only support 4-D phi::DenseTensor.")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_shape_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_shape_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_shape_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) 
{ - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - PADDLE_ENFORCE_GT(out_h, - 0, - platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, - 0, - platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*input, ctx.GetPlace(), output); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeD - // Add bilineare by zhulei - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2") - .AddInput(*input) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearFwdNpu(ctx, - input, - output, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -template -class InterpolateV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_w = scale_data[0]; - scale_h = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' 
phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeGradD - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(*output_grad) - .AddInput(std::vector{in_h, in_w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearBwdNpu(ctx, - output_grad, - input_grad, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc deleted file mode 100644 index 91a0698d626f5..0000000000000 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/operators/is_empty_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    is_empty,
-    ops::IsEmptyOpKernel<plat::NPUDeviceContext, float>,
-    ops::IsEmptyOpKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc
deleted file mode 100644
index d2b4626c58cb4..0000000000000
--- a/paddle/fluid/operators/kldiv_loss_op_npu.cc
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class KLDivLossNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<phi::DenseTensor>("X");
-    auto* target = ctx.Input<phi::DenseTensor>("Target");
-    auto* loss = ctx.Output<phi::DenseTensor>("Loss");
-    auto reduction = ctx.Attr<std::string>("reduction");
-    loss->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
-    auto stream = dev_ctx.stream();
-
-    if ("none" == reduction) {
-      // log(label)
-      auto ones_tensor = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& ones_runner =
-          NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {});
-      ones_runner.Run(stream);
-
-      auto sub_tensor = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& sub_runner =
-          NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {});
-      sub_runner.Run(stream);
-
-      auto log_target = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& log_runner =
-          NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {});
-      log_runner.Run(stream);
-
-      // log(label) - input
-      const auto& sub_runner2 =
-          NpuOpRunner("Sub", {log_target, *input}, {*loss}, {});
-      sub_runner2.Run(stream);
-
-      // label * (log(label) - input)
-      auto min_value =
-          ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>({1}, dev_ctx);
-      auto max_value =
-          ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>({1}, dev_ctx);
-      FillNpuTensorWithConstant<T>(&min_value, static_cast<T>(0));
-      FillNpuTensorWithConstant<T>(&max_value, std::numeric_limits<T>::max());
-
-      auto cliped_target = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& clip_runner = NpuOpRunner(
-          "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {});
-      clip_runner.Run(stream);
-
-      const auto& mul_runner =
-          NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {});
-      mul_runner.Run(stream);
-    } else if ("batchmean" == reduction || "sum" == reduction) {
-      const auto& runner = NpuOpRunner(
-          "KLDiv", {*input, *target}, {*loss}, {{"reduction", reduction}});
-      runner.Run(stream);
-    } else if ("mean" == reduction) {
-      const auto& runner = NpuOpRunner("KLDiv",
-                                       {*input, *target},
-                                       {*loss},
-                                       {{"reduction", std::string("sum")}});
-      runner.Run(stream);
-
-      const int numel = input->numel();
-      const auto& muls_runner =
-          NpuOpRunner("Muls",
-                      {*loss},
-                      {*loss},
-                      {{"value", static_cast<float>(1.0 / numel)}});
-      muls_runner.Run(stream);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossGradNPUKernel : public
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* target = ctx.Input("Target"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto reduction = ctx.Attr("reduction"); - input_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - phi::DenseTensor loss_grad_transformed; - if ("none" == reduction) { - loss_grad_transformed.ShareDataWith(*loss_grad); - } else { - loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); - - NpuOpRunner broadcast_runner; - broadcast_runner.SetType("BroadcastTo"); - broadcast_runner.AddInput(*loss_grad); - broadcast_runner.AddInput(phi::vectorize(input_grad->dims())); - broadcast_runner.AddOutput(loss_grad_transformed); - broadcast_runner.Run(stream); - } - auto min_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto max_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - FillNpuTensorWithConstant(&min_value, static_cast(0)); - FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); - - auto cliped_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& clip_runner = NpuOpRunner( - "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); - clip_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner( - "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); - mul_runner.Run(stream); - - float k = -1.0f; - - if ("mean" == reduction) { - k = static_cast(-1.0 / input_grad->numel()); - } else if ("batchmean" == reduction) { - k = static_cast(-1.0 / input_grad->dims()[0]); - } - - const auto& muls_runner = - NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); - muls_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(kldiv_loss, - ops::KLDivLossNPUKernel, - ops::KLDivLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, - ops::KLDivLossGradNPUKernel, - ops::KLDivLossGradNPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc deleted file mode 100644 index 5c267625f55f7..0000000000000 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
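The label_smooth kernels in the file below combine Muls/Adds (and a broadcast AddV2 when PriorDist is given) into the usual smoothing rule: out = (1 - eps) * x + eps * p with a prior p, or eps / K without one, where K is the size of the last axis; the backward is simply dx = (1 - eps) * dout. As a reference-only sketch of the same computation on a plain array (editorial addition; the names here are illustrative, not Paddle APIs):

#include <cstddef>
#include <vector>

// Label smoothing over one row of K class probabilities.
// prior may be nullptr, in which case the uniform 1/K prior is used.
std::vector<float> LabelSmooth(const std::vector<float>& x, float epsilon,
                               const std::vector<float>* prior) {
  const float k = static_cast<float>(x.size());
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    // out = (1 - eps) * x + eps * prior   (PriorDist given)
    // out = (1 - eps) * x + eps / K       (no PriorDist)
    out[i] = (1.0f - epsilon) * x[i] +
             (prior ? epsilon * (*prior)[i] : epsilon / k);
  }
  return out;
}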
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LabelSmoothMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAddBroadCast(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in1, - const phi::DenseTensor* in2, - phi::DenseTensor* out) { - out->mutable_data(place); - const auto& runner = NpuOpRunner("AddV2", {*in1, *in2}, {*out}, {}); - runner.Run(stream); -} - -template -class LabelSmoothNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto epsilon = ctx.Attr("epsilon"); - - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - if (dist_t) { - phi::DenseTensor tmp; - phi::DenseTensor dist; - phi::DenseTensor tmp2; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothMuls(place, stream, dist_t, epsilon, &tmp2); - tmp2.Resize({1, label_dim}); - LabelSmoothAddBroadCast(place, stream, &tmp, &tmp2, out_t); - } else { - phi::DenseTensor tmp; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothAdds(place, stream, &tmp, (epsilon / label_dim), out_t); - } - } -}; - -template -class LabelSmoothGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - auto epsilon = ctx.Attr("epsilon"); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - LabelSmoothMuls(place, stream, d_out_t, 1 - epsilon, d_in_t); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(label_smooth, - ops::LabelSmoothNPUKernel, - ops::LabelSmoothNPUKernel); -REGISTER_OP_NPU_KERNEL(label_smooth_grad, - ops::LabelSmoothGradNPUKernel, - ops::LabelSmoothGradNPUKernel); diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc deleted file mode 100644 index ca6762f2e325a..0000000000000 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ /dev/null @@ -1,449 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
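
The label_smooth kernels deleted above amount to a convex blend of the input distribution with a prior; a scalar sketch under that reading (hypothetical helper, not part of the deleted file):

#include <vector>

// out = (1 - epsilon) * x + epsilon * prior; the prior defaults to the
// uniform value 1 / num_classes when no PriorDist input is given, and the
// gradient is simply d_x = (1 - epsilon) * d_out.
void LabelSmoothRef(const std::vector<float>& x, float epsilon,
                    int num_classes, std::vector<float>* out) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = (1.0f - epsilon) * x[i] + epsilon / num_classes;
  }
}
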
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -using DataLayout = phi::DataLayout; - -template -class NormDataType; - -template <> -class NormDataType { - public: - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template <> -class NormDataType { - public: - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template -using NormDataType = NormDataType; -template -using LayerNormParamType = typename NormDataType::BatchNormParamType; - -template -class LayerNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); - const auto& x_dims = x->dims(); - std::vector axes; - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - // The shape of scale and bias should be equal to x.shape[begin_norm_axis:], - // required by Ascend. - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - phi::DenseTensor default_bias(x->type()); - if (!bias) { - default_bias.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); - runner.Run(stream); - bias = &default_bias; - } else { - const_cast(bias)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast bias from LayerNormParamType to T if needed - phi::DenseTensor cast_bias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32) { - 
cast_bias.Resize(bias->dims()); - cast_bias.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_bias = - NpuOpRunner("Cast", - {*bias}, - {cast_bias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_bias.Run(stream); - } else { - cast_bias.ShareDataWith(*bias); - } - - y->mutable_data(ctx.GetPlace()); - - // mean should be of U type - phi::DenseTensor* tmp_mean = mean; - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - tmp_mean = &cast_mean; - mean->mutable_data(ctx.GetPlace()); - } else { - mean->mutable_data(ctx.GetPlace()); - } - - // same for variance - phi::DenseTensor* tmp_variance = variance; - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - tmp_variance = &cast_variance; - variance->mutable_data(ctx.GetPlace()); - } else { - variance->mutable_data(ctx.GetPlace()); - } - - const auto& runner = NpuOpRunner("LayerNorm", - {*x, cast_scale, cast_bias}, - {*y, *tmp_mean, *tmp_variance}, - {{"begin_norm_axis", begin_norm_axis}, - {"begin_params_axis", begin_norm_axis}, - {"epsilon", epsilon}}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(mean->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*tmp_mean}, - {*mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } - // same for variance - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(variance->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*tmp_variance}, - {*variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } - - // revert shape of scale and bias - // TODO(zhiqiu): better implementation, use tmp tensor to avoid write input - // tensor. 
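
// In effect, the LayerNorm runner above normalizes each row of the
// [left, right] view selected by begin_norm_axis:
//   y = (x - mean(x)) / sqrt(var(x) + epsilon) * scale + bias,
// emitting per-row mean and variance; the surrounding FP16/FP32 casts
// exist because the Ascend op apparently requires scale, bias and the
// output statistics to share the input dtype.
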
- const_cast(scale)->Resize(phi::make_ddim({right})); - const_cast(bias)->Resize(phi::make_ddim({right})); - } -}; - -template -class LayerNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto* x = ctx.Input("X"); - const auto& x_dims = x->dims(); - const auto* mean = ctx.Input("Mean"); - const auto* variance = ctx.Input("Variance"); - const auto* scale = ctx.Input("Scale"); - const auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dscale = - ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - std::vector axes; - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // No need to compute any gradient, just return - if (!dx && !dscale && !dbias) { - return; - } - - // The rank of mean should be equal to that of x, as required by Ascend. - std::vector new_shape; - for (auto i = 0; i < begin_norm_axis; ++i) { - new_shape.push_back(x_dims[i]); - } - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - new_shape.push_back(1); - } - - auto mean_dims = mean->dims(); - const_cast(mean)->Resize(phi::make_ddim({new_shape})); - const_cast(variance)->Resize( - phi::make_ddim({new_shape})); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast mean from LayerNormParamType to T if needed - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*mean}, - {cast_mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } else { - cast_mean.ShareDataWith(*mean); - } - - // cast variance from LayerNormParamType to T if needed - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == -
framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*variance}, - {cast_variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } else { - cast_variance.ShareDataWith(*variance); - } - - phi::DenseTensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); - dx = (dx == nullptr) ? &dx_ : dx; - dscale = (dscale == nullptr) ? &dscale_ : dscale; - dbias = (dbias == nullptr) ? &dbias_ : dbias; - - dx->Resize(x->dims()); - dx->mutable_data(ctx.GetPlace()); - - dscale->Resize(phi::make_ddim(axes)); - - dbias->Resize(phi::make_ddim(axes)); - - // dscale should be of U type - phi::DenseTensor* tmp_dscale = dscale; - phi::DenseTensor cast_dscale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dscale.Resize(dscale->dims()); - cast_dscale.mutable_data(ctx.GetPlace()); - tmp_dscale = &cast_dscale; - dscale->mutable_data(ctx.GetPlace()); - } else { - dscale->mutable_data(ctx.GetPlace()); - } - - // same for dbias - phi::DenseTensor* tmp_dbias = dbias; - phi::DenseTensor cast_dbias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dbias.Resize(dbias->dims()); - cast_dbias.mutable_data(ctx.GetPlace()); - tmp_dbias = &cast_dbias; - dbias->mutable_data(ctx.GetPlace()); - } else { - dbias->mutable_data(ctx.GetPlace()); - } - - const auto& runner = - NpuOpRunner("LayerNormGrad", - {*dy, *x, cast_variance, cast_mean, cast_scale}, - {*dx, *tmp_dscale, *tmp_dbias}, - {}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dscale->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dscale->type())); - const auto& runner_cast_dscale = - NpuOpRunner("Cast", - {*tmp_dscale}, - {*dscale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dscale.Run(stream); - } - // same for dbias - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dbias->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dbias->type())); - const auto& runner_cast_dbias = - NpuOpRunner("Cast", - {*tmp_dbias}, - {*dbias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dbias.Run(stream); - } - - const_cast(mean)->Resize(mean_dims); - const_cast(variance)->Resize(mean_dims); - const_cast(scale)->Resize(phi::make_ddim({right})); - dscale->Resize(phi::make_ddim({right})); - dbias->Resize(phi::make_ddim({right})); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(layer_norm, - 
ops::LayerNormNPUKernel, - ops::LayerNormNPUKernel); -REGISTER_OP_NPU_KERNEL(layer_norm_grad, - ops::LayerNormGradNPUKernel, - ops::LayerNormGradNPUKernel); diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc deleted file mode 100644 index 4b9b96c23b0b7..0000000000000 --- a/paddle/fluid/operators/load_combine_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/load_combine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc deleted file mode 100644 index 0e8517fd7b529..0000000000000 --- a/paddle/fluid/operators/load_op_npu.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { -template -class LoadOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
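
// Overall flow of this kernel: open file_path as a binary stream,
// deserialize either a DenseTensor (honoring the optional seek/shape
// attributes) or a SelectedRows variable into the output, then cast the
// tensor to FP16 in place when load_as_fp16 is set.
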
- auto filename = ctx.Attr("file_path"); - std::ifstream fin(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), - true, - platform::errors::Unavailable( - "Load operator fail to open file %s, please check " - "whether the model file is complete or damaged.", - filename)); - - auto out_var_name = ctx.OutputNames("Out").data(); - auto *out_var = ctx.OutputVar("Out"); - - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::InvalidArgument( - "The variable %s to be loaded cannot be found.", out_var_name)); - - if (out_var->IsType()) { - LoadLodTensor(fin, place, out_var, ctx); - } else if (out_var->IsType()) { - LoadSelectedRows(fin, place, out_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Load operator only supports loading phi::DenseTensor and " - "SelectedRows " - "variable, %s has wrong type", - out_var_name)); - } - } - - void LoadLodTensor(std::istream &fin, - const platform::Place &place, - framework::Variable *var, - const framework::ExecutionContext &ctx) const { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - auto *tensor = var->GetMutable(); - - auto seek = ctx.Attr("seek"); - - if (seek != -1) { - PADDLE_ENFORCE_GE(seek, - 0, - platform::errors::InvalidArgument( - "seek witn tensor must great than or equal to 0")); - auto shape = ctx.Attr>("shape"); - paddle::framework::DeserializeFromStream( - fin, tensor, dev_ctx, seek, shape); - } else { - paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx); - } - - auto load_as_fp16 = ctx.Attr("load_as_fp16"); - auto in_dtype = tensor->dtype(); - auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = - phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, in_dtype); - auto out_kernel_type = - phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, out_dtype); - phi::DenseTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType( - in_kernel_type, out_kernel_type, *tensor, &fp16_tensor); - - // reset output tensor - var->Clear(); - tensor = var->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); - } - } - - void LoadSelectedRows(std::istream &fin, - const platform::Place &place, - framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc deleted file mode 100644 index 0eb4ebe2442c1..0000000000000 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LogLossAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scale, - phi::DenseTensor* y) { - // Calculate y = x + scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scale}}); - runner.Run(stream); -} - -template -void LogLossMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scale, - phi::DenseTensor* y) { - // Calculate y = x * scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scale}}); - runner.Run(stream); -} - -template -void LogLossBCE(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - z->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *y}, - {*z}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); -} - -template -void LogLossBCEGrad(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - const phi::DenseTensor* dout, - phi::DenseTensor* dx) { - dx->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *y, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); -} - -template -class LogLossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Loss"); - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - float factor = 1 / (1 + 2 * epsilon); - float coef = std::log(factor); - LogLossAdds(place, stream, pred, epsilon, y); - LogLossMuls(place, stream, y, factor, y); - LogLossBCE(place, stream, y, label, y); - LogLossAdds(place, stream, y, coef, y); - } -}; - -template -class LogLossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = - ctx.Output(framework::GradVarName("Predicted")); - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - if (dpred) { - LogLossBCEGrad(place, stream, pred, label, dloss, dpred); - LogLossMuls(place, stream, dpred, 1 / (1 + 2 * epsilon), dpred); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(log_loss, ops::LogLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_loss_grad, ops::LogLossGradNPUKernel); diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc
b/paddle/fluid/operators/log_softmax_op_npu.cc deleted file mode 100644 index 34f9c11e066a7..0000000000000 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class LogSoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - const int rank = X->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - Out->mutable_data(ctx.GetPlace()); - - if (X->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner( - "LogSoftmaxV2", {*X}, {*Out}, {{"axes", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -template -class LogSoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - const int rank = dOut->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(ctx.GetPlace()); - - if (dOut->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("LogSoftmaxGrad", - {*dOut, *Out}, - {*dX}, - {{"axis", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(log_softmax, - ops::LogSoftmaxNPUKernel, - ops::LogSoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_softmax_grad, - ops::LogSoftmaxGradNPUKernel, - ops::LogSoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc deleted file mode 100644 index 8ae050541fb23..0000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
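
The LogSoftmaxV2/LogSoftmaxGrad calls removed above follow the standard definition along the canonicalized axis; a numerically stable one-axis sketch (hypothetical helper, not part of the deleted file):

#include <algorithm>
#include <cmath>
#include <vector>

// log_softmax(x)_i = x_i - log(sum_j exp(x_j)), computed with the usual
// max-subtraction trick to avoid overflow.
std::vector<float> LogSoftmaxRef(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  for (float v : x) sum += std::exp(v - m);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] - m - std::log(sum);
  return out;
}
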
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -constexpr int64_t kNoPadding = -1; - -template -class LookupTableV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); // int tensor - auto *output_t = ctx.Output("Out"); // float tensor - auto *table_t = ctx.Input("W"); - - auto *table_var = ctx.InputVar("W"); - PADDLE_ENFORCE_EQ( - table_var->IsType(), - true, - platform::errors::InvalidArgument("NPU only accepts phi::DenseTensor")); - output_t->mutable_data(ctx.GetPlace()); - - int64_t padding_idx = ctx.Attr("padding_idx"); - if (padding_idx == kNoPadding) { - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } else { - phi::DenseTensor tmp_table_t(table_t->type()); - tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - - phi::DenseTensor index; - index.mutable_data({1, 1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&index, - static_cast(padding_idx)); - - auto updata_dim = phi::make_ddim({1, table_t->dims()[1]}); - phi::DenseTensor update; - update.mutable_data(updata_dim, ctx.GetPlace()); - FillNpuTensorWithConstant(&update, static_cast(0)); - update.Resize(updata_dim); - - NpuOpRunner update_runner; - update_runner.SetType("TensorScatterUpdate") - .AddInput(*table_t) - .AddInput(index) - .AddInput(update) - .AddOutput(tmp_table_t); - update_runner.Run(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(tmp_table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } - } -}; - -template -class LookupTableV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); - auto *output_grad_t = - ctx.Input(framework::GradVarName("Out")); - auto *table_grad_t = - ctx.Output(framework::GradVarName("W")); - table_grad_t->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - int64_t padding_idx = ctx.Attr("padding_idx"); - - /* EmbeddingDenseGrad has a bug on large shapes, temporarily disable it. - - int embedding_dim = table_grad_t->dims()[1]; - if (embedding_dim % 32 == 0) { - // NOTE(pangyoki): The embedding_dim of phi::DenseTensor used in - // EmbeddingDenseGrad must be an integer multiple of 32. - int num_weights = table_grad_t->dims()[0]; - const auto &runner = - NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, - {*table_grad_t}, {{"num_weights", num_weights}, - {"padding_idx", -1}, - {"scale_grad_by_freq", false}}); - runner.Run(stream); - return; - } - */ - - const auto &runner_zeros = - NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); - runner_zeros.Run(stream); - - if (padding_idx == kNoPadding) { - // NOTE(zhiqiu): It seems in cann 20.1, the first input and output - // can be different tensors, but in cann 20.2+, it does an inplace - // operation. Thus, the first input and output should be the same tensor.
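
// The no-padding branch below is a scatter-add over the zeroed table:
// in effect d_W[ids[i]] += d_out[i] for every looked-up row, which is
// exactly the dense embedding gradient.
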
- const auto &runner_scatter = - NpuOpRunner("ScatterAdd", - {*table_grad_t, *ids_t, *output_grad_t}, - {*table_grad_t}, - {{"use_locking", true}}); - runner_scatter.Run(stream); - } else { - phi::DenseTensor casted_ids_t; - if (framework::TransToProtoVarType(ids_t->dtype()) != - framework::proto::VarType::INT32) { - casted_ids_t.mutable_data(ids_t->dims(), ctx.GetPlace()); - const auto &cast_runner = NpuOpRunner( - "Cast", {*ids_t}, {casted_ids_t}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_ids_t.ShareDataWith(*ids_t); - } - auto table_grad_dims = table_grad_t->dims(); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*output_grad_t) - .AddInput(casted_ids_t) - .AddInput(std::vector{table_grad_dims[0]}) - .AddOutput(*table_grad_t); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2_grad, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc deleted file mode 100644 index 96fba4b968869..0000000000000 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MaskedSelectedNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto mask = ctx.Input("Mask"); - auto out = ctx.Output("Y"); - - auto input_dim = input->dims(); - auto mask_dim = mask->dims(); - PADDLE_ENFORCE_EQ( - input_dim, - mask_dim, - platform::errors::InvalidArgument( - "The dim size of input and mask in OP(masked_selected) " - "must be equal, but got input dim:(%ld), mask dim: " - "(%ld). 
Please check input " - "value.", - input_dim, - mask_dim)); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - out->Resize({out_size_vec[0]}); - out->mutable_data(ctx.GetPlace()); - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - // TopKV2 may be unstable - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - Tensor input_tmp; - input_tmp.ShareDataWith(*input); - input_tmp.Resize({input->numel()}); - const auto& gather_runner = NpuOpRunner( - "GatherV2D", {input_tmp, topkv2_out}, {*out}, {{"axis", 0}}); - gather_runner.Run(stream); - } - } -}; - -template -class MaskedSelectedGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto mask = ctx.Input("Mask"); - auto y_grad = ctx.Input(framework::GradVarName("Y")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - x_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - 
.AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - topkv2_out.Resize({out_size_vec[0], 1}); - x_grad->Resize({x_grad->numel()}); - NpuOpRunner scatter_runner; - scatter_runner.SetType("ScatterNd"); - scatter_runner.AddInput(topkv2_out); - scatter_runner.AddInput(*y_grad); - scatter_runner.AddInput( - std::vector({static_cast(x_grad->numel())})); - scatter_runner.AddOutput(*x_grad); - scatter_runner.Run(stream); - x_grad->Resize(mask->dims()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(masked_select, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel); -REGISTER_OP_NPU_KERNEL(masked_select_grad, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 61cc7dc9f4b64..c52393c3e05ad 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,9 +20,8 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(sequence_pooling DEPS math_function jit_kernel_helper) -if(WITH_ASCEND_CL) - math_library(beam_search DEPS math_function beam_search_npu) -elseif(WITH_XPU) + +if(WITH_XPU) math_library(beam_search DEPS math_function beam_search_xpu) else() math_library(beam_search DEPS math_function) diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc deleted file mode 100644 index 937cd46d52888..0000000000000 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ /dev/null @@ -1,588 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
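
For reference, the masked_select kernels removed above (Cast, ReduceSum, TopKV2, GatherV2D) are equivalent to compacting the elements whose mask is true while preserving index order; a minimal sketch (hypothetical helper, not part of the deleted file):

#include <vector>

std::vector<float> MaskedSelectRef(const std::vector<float>& x,
                                   const std::vector<bool>& mask) {
  std::vector<float> out;
  for (size_t i = 0; i < x.size(); ++i) {
    if (mask[i]) out.push_back(x[i]);  // keep only masked-in elements
  }
  return out;
}
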
*/ - -#include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/phi/common/data_type.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class NPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class BeamSearchFunctor { - public: - void operator()(const platform::NPUDeviceContext& ctx, - const phi::DenseTensor* pre_ids, - const phi::DenseTensor* pre_scores, - const phi::DenseTensor* ids, - const phi::DenseTensor* scores, - phi::DenseTensor* selected_ids, - phi::DenseTensor* selected_scores, - phi::DenseTensor* parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated) { - auto abs_lod = framework::ToAbsOffset(scores->lod()); - auto& high_level = abs_lod[level]; - - int64_t num_seqs = scores->NumElements(level); - // size of the first beam is 1, others are equal to beam_size - int64_t real_beam_size = static_cast(scores->dims()[0] / num_seqs); - // K - int64_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { - seq_width *= scores->dims()[i]; - } - - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - - int64_t total_length = num_seqs * beam_size; - int64_t batch_size = static_cast(scores->dims()[0]); - selected_ids->mutable_data(phi::make_ddim({total_length, 1}), - place); - selected_scores->mutable_data(phi::make_ddim({total_length, 1}), - place); - parent_idx->mutable_data(phi::make_ddim({total_length}), place); - - // Step1: Define Tensors and Preprocess the situation that pre_id == end_id - - // cast ids and pre_ids from int to float32 - Tensor ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(ids->dtype()) != - framework::proto::VarType::INT32) { - ids_int32.Resize(ids->dims()); - ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_ids_int32 = - ConvertToNpuDtype(framework::TransToProtoVarType(ids_int32.dtype())); - const auto& runner_ids_int32 = - NpuOpRunner("Cast", - {*ids}, - {ids_int32}, - {{"dst_type", static_cast(dst_dtype_ids_int32)}}); - runner_ids_int32.Run(stream); - } else { - ids_int32.ShareDataWith(*ids); - } - - Tensor pre_ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(pre_ids->dtype()) != - framework::proto::VarType::INT32) { - pre_ids_int32.Resize(pre_ids->dims()); - pre_ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_pre_ids_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(pre_ids_int32.dtype())); - const auto& runner_pre_ids_int32 = NpuOpRunner( - "Cast", - {*pre_ids}, - {pre_ids_int32}, - {{"dst_type", static_cast(dst_dtype_pre_ids_int32)}}); - runner_pre_ids_int32.Run(stream); - } else { - pre_ids_int32.ShareDataWith(*pre_ids); - } - - Tensor expand_pre_ids(pre_ids_int32.dtype()); - expand_pre_ids.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_ids.mutable_data(place); - const auto& runner_tile_pre_ids = - NpuOpRunner("TileWithAxis", - {pre_ids_int32}, - {expand_pre_ids}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_ids.Run(stream); - expand_pre_ids.Resize(ids_int32.dims()); - - Tensor expand_pre_scores(pre_scores->dtype()); - expand_pre_scores.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_scores.mutable_data(place); - const auto& runner_tile_pre_scores = - NpuOpRunner("TileWithAxis", - {*pre_scores}, - {expand_pre_scores}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_scores.Run(stream); - 
expand_pre_scores.Resize(scores->dims()); - - // End_id Tensors - Tensor end_id_tmp_tensor(phi::DataType::INT32); - end_id_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&end_id_tmp_tensor, end_id); - - Tensor end_id_tensors(ids_int32.dtype()); - end_id_tensors.mutable_data(ids_int32.dims(), place); - const auto& runner_fill_end_id = - NpuOpRunner("FillD", - {end_id_tmp_tensor}, - {end_id_tensors}, - {{"dims", phi::vectorize(ids_int32.dims())}}); - runner_fill_end_id.Run(stream); - - // whether expand_pre_ids == end_ids? - Tensor equal_end_ids(phi::DataType::BOOL); - equal_end_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_equal_end_ids = NpuOpRunner( - "Equal", {expand_pre_ids, end_id_tensors}, {equal_end_ids}, {}); - runner_equal_end_ids.Run(stream); - - // construct a Tensor with dimension ids->dims(): - // [[False, True, True, True, ...], - // [False, True, True, True, ...], - // ...] - Tensor false_tmp_tensor(phi::DataType::INT32); - false_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&false_tmp_tensor, static_cast(false)); - - Tensor first_pos_false_tensors(phi::DataType::INT32); - first_pos_false_tensors.Resize(phi::make_ddim({batch_size, 1})); - first_pos_false_tensors.mutable_data(place); - std::vector fill_dims = {batch_size, 1}; - framework::NPUAttributeMap fill_attr = {{"dims", fill_dims}}; - const auto& runner_fill_false_tensors = NpuOpRunner( - "FillD", {false_tmp_tensor}, {first_pos_false_tensors}, fill_attr); - runner_fill_false_tensors.Run(stream); - - Tensor pos_tensors(phi::DataType::INT32); - if (seq_width > 1) { - pos_tensors.Resize(phi::make_ddim({batch_size, seq_width})); - pos_tensors.mutable_data(place); - - Tensor true_tmp_tensor(phi::DataType::INT32); - true_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&true_tmp_tensor, static_cast(true)); - - Tensor second_pos_true_tensors(phi::DataType::INT32); - second_pos_true_tensors.Resize( - phi::make_ddim({batch_size, seq_width - 1})); - second_pos_true_tensors.mutable_data(place); - std::vector fill_dims2 = {batch_size, seq_width - 1}; - framework::NPUAttributeMap fill_attr2 = {{"dims", fill_dims2}}; - const auto& runner_fill_true_tensors = NpuOpRunner( - "FillD", {true_tmp_tensor}, {second_pos_true_tensors}, fill_attr2); - runner_fill_true_tensors.Run(stream); - - std::vector concat_inputs = {first_pos_false_tensors, - second_pos_true_tensors}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_false_true{"ConcatD", - {concat_inputs}, - {pos_tensors}, - {{"concat_dim", 1}, {"N", 2}}}; - runner_concat_false_true.AddInputNames(concat_names); - runner_concat_false_true.Run(stream); - pos_tensors.Resize(ids_int32.dims()); - } else { - pos_tensors.ShareDataWith(first_pos_false_tensors); - } - - Tensor cast_pos_tensors_bool(phi::DataType::BOOL); - cast_pos_tensors_bool.Resize(pos_tensors.dims()); - cast_pos_tensors_bool.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_pos_tensors_bool.type())); - const auto& runner_cast_pos_tensors = - NpuOpRunner("Cast", - {pos_tensors}, - {cast_pos_tensors_bool}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_pos_tensors.Run(stream); - - // if pre_ids == end_ids, save only one score, and others become -inf - // construct pre_ids == end_ids and save only one score - Tensor save_one_end_score(phi::DataType::BOOL); - save_one_end_score.mutable_data(ids_int32.dims(), place); - const auto& runner_logical_and = 
- NpuOpRunner("LogicalAnd", - {equal_end_ids, cast_pos_tensors_bool}, - {save_one_end_score}, - {}); - runner_logical_and.Run(stream); - - // if save_one_end_score is True, set score to -inf - // define -Inf Tensors - Tensor ninf_tmp_tensor(scores->dtype()); - ninf_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - float ninf_value = - static_cast(-std::numeric_limits::infinity()); - FillNpuTensorWithConstant(&ninf_tmp_tensor, ninf_value); - - Tensor ninf_tensors(scores->dtype()); - ninf_tensors.mutable_data(scores->dims(), place); - const auto& runner_fill_ninf = - NpuOpRunner("FillD", - {ninf_tmp_tensor}, - {ninf_tensors}, - {{"dims", phi::vectorize(scores->dims())}}); - runner_fill_ninf.Run(stream); - - // Step2: calculate topk scores - - // get scores used in topk op - Tensor tmp_scores(scores->dtype()); - tmp_scores.mutable_data(scores->dims(), place); - if (!is_accumulated) { - // if pre_id == end_id, cal_scores = pre_score, and id = end_id - // else, cal_score = pre_score + log(score) - - // calculate log(scores) - Tensor log_scores(scores->dtype()); - log_scores.mutable_data(scores->dims(), place); - - Tensor one(scores->dtype()); - one.mutable_data(scores->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*scores}, {one}, {}); - runner_one.Run(stream); - - Tensor sub(scores->dtype()); - sub.mutable_data(scores->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*scores, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_log_scores = - NpuOpRunner("Log1p", {sub}, {log_scores}, {}); - runner_log_scores.Run(stream); - - // tmp_scores = pre_score + log(scores) - const auto& runner_add_scores = - NpuOpRunner("Add", {log_scores, *pre_scores}, {tmp_scores}, {}); - runner_add_scores.Run(stream); - - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, tmp_scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score.Run(stream); - } else { - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score2 = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, *scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score2.Run(stream); - } - - // if pre_ids == end_ids, save only one score, and others become -inf - Tensor cal_scores(scores->dtype()); - cal_scores.mutable_data(scores->dims(), place); - const auto& runner_select_inf_score = - NpuOpRunner("Select", - {save_one_end_score, ninf_tensors, tmp_scores}, - {cal_scores}, - {}); - runner_select_inf_score.Run(stream); - - // resize scores from [num_seqs * beam_size, K] to [num_seqs, beam_size * K] - // real_beam_size = 1 or beam_size - cal_scores.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - Tensor topk_scores(scores->dtype()); - topk_scores.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - topk_scores.mutable_data(ctx.GetPlace()); - - Tensor tmp_indices(phi::DataType::INT32); - tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - tmp_indices.mutable_data(ctx.GetPlace()); - - // run topk op - NpuOpRunner runner_topk; - runner_topk.SetType("TopKV2") - .AddInput(cal_scores) - .AddInput(std::vector{static_cast(beam_size)}) - .AddOutput(topk_scores) - .AddOutput(tmp_indices) - .AddAttr("sorted", true) - .AddAttr("dim", -1) - .AddAttr("largest", true); - runner_topk.Run(stream); - - // cast tmp_indices from int to float32 for Sort op - Tensor 
cast_tmp_indices(phi::DataType::FLOAT32); - cast_tmp_indices.Resize(tmp_indices.dims()); - cast_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_fp32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_tmp_indices.type())); - const auto& runner_cast_tmp_indices = NpuOpRunner( - "Cast", - {tmp_indices}, - {cast_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_fp32)}}); - runner_cast_tmp_indices.Run(stream); - - // sort tmp_indices - Tensor sorted_tmp_indices(phi::DataType::FLOAT32); - sorted_tmp_indices.Resize(tmp_indices.dims()); - sorted_tmp_indices.mutable_data(ctx.GetPlace()); - Tensor sorted_score_indices(phi::DataType::INT32); - sorted_score_indices.Resize(tmp_indices.dims()); - sorted_score_indices.mutable_data(ctx.GetPlace()); - const auto& runner_sort_tmp_indices = - NpuOpRunner("Sort", - {cast_tmp_indices}, - {sorted_tmp_indices, sorted_score_indices}, - {{"axis", 1}, {"descending", false}}); - runner_sort_tmp_indices.Run(stream); - - // cast sorted_tmp_indices from float32 to int - Tensor cast_sort_tmp_indices(phi::DataType::INT32); - cast_sort_tmp_indices.Resize(sorted_tmp_indices.dims()); - cast_sort_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_sort_tmp_indices.type())); - const auto& runner_cast_sort_tmp_indices = NpuOpRunner( - "Cast", - {sorted_tmp_indices}, - {cast_sort_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_int32)}}); - runner_cast_sort_tmp_indices.Run(stream); - - // Step 3: infer selected ids from tmp_indices and ids - - // if pre_ids == end_ids, use pre_ids rather than ids - Tensor cal_ids(ids_int32.dtype()); - cal_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_select_equal_end_id = NpuOpRunner( - "Select", {equal_end_ids, expand_pre_ids, ids_int32}, {cal_ids}, {}); - runner_select_equal_end_id.Run(stream); - - // resize ids from [num_seqs * real_beam_size, K] to [num_seqs, - // real_beam_size * K] - // real_beam_size = 1 or beam_size - cal_ids.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - // construct batch_ids like [[0, 0, 0], [1, 1, 1], ..., [bs-1, bs-1, bs-1]] - // construct arange(num_seqs*beam_size).reshape((num_seqs, beam_size)) // - // beam_size - Tensor batch_ids(phi::DataType::INT32); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - batch_ids.mutable_data(place); - - std::vector vector_batch_ids; - for (int i = 0; i < num_seqs * static_cast(beam_size); ++i) { - vector_batch_ids.push_back(static_cast(i / beam_size)); - } - framework::TensorFromVector(vector_batch_ids, ctx, &batch_ids); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - - // sort topk_scores to get selected_scores - // get indices of gather_nd op for calculating selected_scores - Tensor gather_nd_score_indices(phi::DataType::INT32); - gather_nd_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_score_indices.mutable_data(place); - - sorted_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - std::vector concat_inputs2 = {batch_ids, - sorted_score_indices}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_score_indices{"ConcatD", - {concat_inputs2}, - {gather_nd_score_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_score_indices.AddInputNames(concat_names); - runner_concat_score_indices.Run(stream); - - // use gather_nd to get 
selected_scores - const auto& runner_gather_nd_scores = - NpuOpRunner("GatherNd", - {topk_scores, gather_nd_score_indices}, - {*selected_scores}, - {}); - runner_gather_nd_scores.Run(stream); - - // get indices of gather_nd op - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - Tensor gather_nd_id_indices(phi::DataType::INT32); - gather_nd_id_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_id_indices.mutable_data(place); - - std::vector concat_inputs3 = {batch_ids, - cast_sort_tmp_indices}; - NpuOpRunner runner_concat_id_indices{"ConcatD", - {concat_inputs3}, - {gather_nd_id_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_id_indices.AddInputNames(concat_names); - runner_concat_id_indices.Run(stream); - - // use gather_nd to get selected_ids - Tensor topk_ids(phi::DataType::INT32); - topk_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - topk_ids.mutable_data(ctx.GetPlace()); - - const auto& runner_gather_nd_ids = NpuOpRunner( - "GatherNd", {cal_ids, gather_nd_id_indices}, {topk_ids}, {}); - runner_gather_nd_ids.Run(stream); - - // cast topk_ids from int to int64 to get selected_ids - auto dst_dtype_selected_ids = - ConvertToNpuDtype(framework::TransToProtoVarType(selected_ids->type())); - const auto& runner_cast_selected_ids = - NpuOpRunner("Cast", - {topk_ids}, - {*selected_ids}, - {{"dst_type", static_cast(dst_dtype_selected_ids)}}); - runner_cast_selected_ids.Run(stream); - - // TODO(pangyoki): PruneEndBeams - - // Step 4: set lod of output Tensor - // define Tensor with value `seq_width` - Tensor seq_width_tensor(phi::DataType::INT32); - seq_width_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&seq_width_tensor, - static_cast(seq_width)); - - // beam_ids = tmp_indices // seq_width - Tensor beam_ids(phi::DataType::INT32); - beam_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - beam_ids.mutable_data(ctx.GetPlace()); - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - const auto& runner_div = NpuOpRunner( - "Div", {cast_sort_tmp_indices, seq_width_tensor}, {beam_ids}, {}); - runner_div.Run(stream); - - // get parent_idx by adding batch_ids to beam_ids - // construct scale_batch_ids like [[0, 0, 0], [bw, bw, bw], ..., [bs-1*bw, - // bs-1*bw, bs-1*bw]] - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - // cast batch_ids from int to float32 - Tensor cast_batch_ids(phi::DataType::FLOAT32); - cast_batch_ids.Resize(batch_ids.dims()); - cast_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype1 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_batch_ids.type())); - const auto& runner_cast_batch_ids = - NpuOpRunner("Cast", - {batch_ids}, - {cast_batch_ids}, - {{"dst_type", static_cast(dst_dtype1)}}); - runner_cast_batch_ids.Run(stream); - - // scale batch_ids with beam_size - Tensor scale_batch_ids(phi::DataType::FLOAT32); - scale_batch_ids.Resize(batch_ids.dims()); - scale_batch_ids.mutable_data(place); - const auto& runner_power = - NpuOpRunner("Power", - {cast_batch_ids}, - {scale_batch_ids}, - {{"power", static_cast(1.0)}, - {"scale", static_cast(beam_size)}, - {"shift", static_cast(0.0)}}); - runner_power.Run(stream); - - // cast cast_scale_batch_ids from float32 to int - Tensor cast_scale_batch_ids(phi::DataType::INT32); - cast_scale_batch_ids.Resize(scale_batch_ids.dims()); - cast_scale_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype2 = 
ConvertToNpuDtype( - framework::TransToProtoVarType(cast_scale_batch_ids.type())); - const auto& runner_cast_scale_batch_ids = - NpuOpRunner("Cast", - {scale_batch_ids}, - {cast_scale_batch_ids}, - {{"dst_type", static_cast(dst_dtype2)}}); - runner_cast_scale_batch_ids.Run(stream); - - // calculate parent_idx - Tensor tmp_parent_idx(phi::DataType::INT32); - tmp_parent_idx.Resize(parent_idx->dims()); - tmp_parent_idx.mutable_data(place); - const auto& runner_add_beam_id = NpuOpRunner( - "Add", {beam_ids, cast_scale_batch_ids}, {tmp_parent_idx}, {}); - runner_add_beam_id.Run(stream); - - // cast tmp_parent_idx from int to int64 to get parent_idx - auto dst_dtype_parent_idx = - ConvertToNpuDtype(framework::TransToProtoVarType(parent_idx->type())); - const auto& runner_cast_parent_idx = - NpuOpRunner("Cast", - {tmp_parent_idx}, - {*parent_idx}, - {{"dst_type", static_cast(dst_dtype_parent_idx)}}); - runner_cast_parent_idx.Run(stream); - - std::vector vector_parent_idx; - framework::TensorToVector(tmp_parent_idx, ctx, &vector_parent_idx); - - // set low level, len(low_level) = high_level[-1] - std::vector low_level; - std::vector num_parent_ids(num_seqs * beam_size, - static_cast(0)); - size_t low_level_size = high_level[num_seqs]; - size_t sum_parent_id = 0; - - // calculate number of every parent_id - for (size_t i = 0; i < num_seqs * beam_size; ++i) { - num_parent_ids[vector_parent_idx[i]]++; - } - - // update low_level - low_level.push_back(0); - for (size_t i = 0; i < low_level_size; ++i) { - sum_parent_id += num_parent_ids[i]; - low_level.push_back(sum_parent_id); - } - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(lod))); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); - } -}; - -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc deleted file mode 100644 index d49d9a319ccff..0000000000000 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ /dev/null @@ -1,561 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void Mul(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); - runner_dx.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); - runner_dx.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void Dot(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("MatMul", - {X, Y}, - {Out_temp}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {Out_temp}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const 
std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - float alpha = static_cast(ctx.Attr("alpha")); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), - Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), - Y->numel())); - Out->Resize({1}); - Dot(ctx, stream, *X, *Y, Out, alpha); - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - y_ndim = 2; - out_ndim += 1; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (transpose_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." 
- "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (x_ndim == 2 && y_ndim == 2) { - MatMul2D( - ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); - return; - } - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector vec_dim = {x_temp.numel() / K, K}; - x_temp.Resize(phi::make_ddim(vec_dim)); - MatMul2D( - ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - MatMulND(ctx, - stream, - x_temp_brd, - y_temp_brd, - Out, - transpose_x, - transpose_y, - alpha); - } -}; - -template -class MatMulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - float alpha = static_cast(ctx.Attr("alpha")); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->dtype()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - Mul(ctx, stream, dout_temp, *Y, dX, alpha); - } - if (dY) { - Mul(ctx, stream, dout_temp, *X, dY, alpha); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); 
- if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (transpose_x) { - MatMul2D( - ctx, stream, y_temp, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (transpose_y) { - MatMul2D( - ctx, stream, dout_temp, x_temp, dY, true, transpose_x, alpha); - } else { - MatMul2D( - ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, alpha); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = transpose_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (transpose_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (transpose_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - dX, - false, - !transpose_y, - alpha); - } - } else { - phi::DenseTensor dx_temp(X->dtype()); - 
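// Illustrative aside (assumed shapes, not from the original file): dx_temp
// holds the gradient at the broadcast shape, and ReduceDims then sums it
// back down over every expanded axis. For example, with x_dims = {3, 4}
// and x_broadcast_dims = {2, 3, 4}:
//   diff = 3 - 2 = 1       -> axis 0 is a new leading axis, so reduce it;
//   brd[1] == dims[0] == 3 -> keep axis 1;
//   brd[2] == dims[1] == 4 -> keep axis 2;
// giving axes = {0}, so ReduceSumD collapses {2, 3, 4} back to {3, 4}.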
dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (transpose_x) { - MatMulND(ctx, - stream, - y_temp_brd, - dout_temp, - &dx_temp, - transpose_y, - true, - alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - &dx_temp, - false, - !transpose_y, - alpha); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (transpose_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - dY, - !transpose_x, - false, - alpha); - } - } else { - phi::DenseTensor dy_temp(Y->dtype()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (transpose_y) { - MatMulND(ctx, - stream, - dout_temp, - x_temp_brd, - &dy_temp, - true, - transpose_x, - alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - &dy_temp, - !transpose_x, - false, - alpha); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - matmul, - ops::MatMulNPUKernel, - ops::MatMulNPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_grad, - ops::MatMulGradNPUKernel, - ops::MatMulGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc deleted file mode 100644 index 2a398fbb5499b..0000000000000 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/matmul_v2_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); -} - -#if (CANN_VERSION_CODE < 504000) -template <> -void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - phi::DenseTensor x_fp32, y_fp32, out_fp32; - x_fp32.Resize(X.dims()); - y_fp32.Resize(Y.dims()); - out_fp32.Resize(Out->dims()); - x_fp32.mutable_data(ctx.GetPlace()); - y_fp32.mutable_data(ctx.GetPlace()); - out_fp32.mutable_data(ctx.GetPlace()); - - const auto& cast_x = - NpuOpRunner("Cast", - {X}, - {x_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(x_fp32.type())))}}); - cast_x.Run(stream); - const auto& cast_y = - NpuOpRunner("Cast", - {Y}, - {y_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(y_fp32.type())))}}); - cast_y.Run(stream); - - const auto& runner = NpuOpRunner("BatchMatMul", - {x_fp32, y_fp32}, - {out_fp32}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - - const auto& cast_out = NpuOpRunner( - "Cast", - {out_fp32}, - {*Out}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::TransToProtoVarType(Out->type())))}}); - cast_out.Run(stream); -} -#endif - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = 
ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), - Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), - Y->numel())); - Out->Resize({1}); - Out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); - runner.Run(stream); - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - y_ndim = 2; - out_ndim += 1; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (x_ndim == 2 && y_ndim == 2) { - MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); - return; - } - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector vec_dim = {x_temp.numel() / K, K}; - x_temp.Resize(phi::make_ddim(vec_dim)); - MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); - } -}; - -template -class MatMulV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->type()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); - runner_dx.Run(stream); - } - if (dY) { - dY->mutable_data(ctx.GetPlace()); - const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); - runner_dy.Run(stream); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (trans_x) { - MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); - } else { - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (trans_x) { - MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); - } else { - MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); - } - } else { - phi::DenseTensor dx_temp(X->type()); - dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (trans_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, true); - } else { - MatMulND( - ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, !trans_y); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (trans_y) { - MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); - } else { - MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); - } - } else { - phi::DenseTensor dy_temp(Y->type()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (trans_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, trans_x); - } else { - MatMulND( - ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, false); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL(matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/mean_op_npu.cc 
b/paddle/fluid/operators/mean_op_npu.cc deleted file mode 100644 index 3df6a6a04d541..0000000000000 --- a/paddle/fluid/operators/mean_op_npu.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class MeanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector axes; - - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class MeanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto stream = - context.template device_context() - .stream(); - - auto grad = context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input phi::DenseTensor len should be 1. But " - "received Out@Grad's elements num is %d.", - grad->numel())); - - auto IG = context.Output(framework::GradVarName("X")); - IG->mutable_data(context.GetPlace()); - - // ones - phi::DenseTensor ones(grad->dtype()); - ones.mutable_data(IG->dims(), context.GetPlace()); - const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); - runner_ones.Run(stream); - - // means - phi::DenseTensor mean_tensor(grad->dtype()); - mean_tensor.Resize({1}); - mean_tensor.mutable_data(context.GetPlace()); - FillNpuTensorWithConstant( - &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); - - // means mul ones - phi::DenseTensor mean_ma(grad->dtype()); - mean_ma.Resize(IG->dims()); - mean_ma.mutable_data(context.GetPlace()); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); - runner_mul_1.Run(stream); - - // and mul grad - const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); - runner_mul_2.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - mean, - ops::MeanNPUKernel, - ops::MeanNPUKernel) - -REGISTER_OP_NPU_KERNEL( - mean_grad, - ops::MeanGradNPUKernel, - ops::MeanGradNPUKernel) diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc deleted file mode 100644 index e60af8bd480ea..0000000000000 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MeshgridNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto ins = context.MultiInput("X"); - auto outs = context.MultiOutput("Out"); - PADDLE_ENFORCE_EQ( - (ins.size() > 1) && (ins.size() < 7), - true, - platform::errors::InvalidArgument( - "Expected Tensor numbers between 2 and 6, but only received %d.", - ins.size())); - - int64_t size = ins.size(); - std::vector shape(size); - - for (int64_t i = 0; i < size; i++) { - switch (ins[i]->dims().size()) { - case 0: - shape[i] = 1; - break; - case 1: - shape[i] = ins[i]->dims()[0]; - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected scalar or 1D tensor in the tensor list but got tensor " - "%d: ", - i)); - } - } - - for (int64_t i = 0; i < size; i++) { - std::vector view_shape(size, 1); - view_shape[i] = shape[i]; - - framework::DDim out_dims_reshape = phi::make_ddim(view_shape); - phi::DenseTensor reshape_ins_tensor(ins[i]->dtype()); - reshape_ins_tensor.ShareDataWith(*ins[i]); - reshape_ins_tensor.Resize(out_dims_reshape); - - framework::DDim out_dims = phi::make_ddim(shape); - outs[i]->Resize(out_dims); - outs[i]->mutable_data(context.GetPlace()); - - auto stream = - context.template device_context() - .stream(); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(reshape_ins_tensor) - .AddInput(std::move(shape)) - .AddOutput(*(outs[i])) - .Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - meshgrid, - paddle::operators::MeshgridNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::MeshgridNPUKernel, -#endif - paddle::operators::MeshgridNPUKernel, - paddle::operators::MeshgridNPUKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc deleted file mode 100644 index 094f39366ab35..0000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class AccuracyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* label = ctx.Input("Label"); - auto* indices = ctx.Input("Indices"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - auto stream = - ctx.template device_context() - .stream(); - - int num_samples = inference->dims()[0]; - if (num_samples == 0) { - return; - } - - // cast `indices` or `label` if their type is not consistent - Tensor cast_indices(phi::DataType::INT32); - Tensor cast_label(phi::DataType::INT32); - if (indices->dtype() != label->dtype()) { - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - if (framework::TransToProtoVarType(indices->dtype()) != - framework::proto::VarType::INT32) { - cast_indices.Resize(indices->dims()); - cast_indices.mutable_data(ctx.GetPlace()); - const auto& runner_cast_indices = - NpuOpRunner("Cast", - {*indices}, - {cast_indices}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_indices.Run(stream); - } else { - cast_indices.ShareDataWith(*indices); - } - if (framework::TransToProtoVarType(label->dtype()) != - framework::proto::VarType::INT32) { - cast_label.Resize(label->dims()); - cast_label.mutable_data(ctx.GetPlace()); - const auto& runner_cast_label = - NpuOpRunner("Cast", - {*label}, - {cast_label}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_label.Run(stream); - } else { - cast_label.ShareDataWith(*label); - } - } else { - cast_indices.ShareDataWith(*indices); - cast_label.ShareDataWith(*label); - } - - // equal - Tensor tmp_equal(phi::DataType::BOOL); - tmp_equal.Resize(inference->dims()); - tmp_equal.mutable_data(ctx.GetPlace()); - const auto& runner_equal = - NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); - runner_equal.Run(stream); - - // cast equal - Tensor tmp_equal_cast(phi::DataType::FLOAT32); - tmp_equal_cast.Resize(inference->dims()); - tmp_equal_cast.mutable_data(ctx.GetPlace()); - const auto& runner_cast_equal = NpuOpRunner( - "Cast", - {tmp_equal}, - {tmp_equal_cast}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(tmp_equal_cast.dtype())))}}); - runner_cast_equal.Run(stream); - - // [correct] - // reduce_max - Tensor tmp_correct_max(phi::DataType::FLOAT32); - tmp_correct_max.Resize(phi::make_ddim({num_samples})); - tmp_correct_max.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_max = - NpuOpRunner("ReduceMaxD", - {tmp_equal_cast}, - {tmp_correct_max}, - {{"axes", std::vector{1}}, {"keep_dims", false}}); - runner_reduce_max.Run(stream); - - // reduce_sum - Tensor tmp_correct(phi::DataType::FLOAT32); - tmp_correct.Resize(correct->dims()); - tmp_correct.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {tmp_correct_max}, - {tmp_correct}, - {{"axes", std::vector{0}}, {"keep_dims", false}}); - runner_reduce_sum.Run(stream); - - // cast to int - correct->mutable_data(ctx.GetPlace()); - const auto& runner_cast_correct = - NpuOpRunner("Cast", - {tmp_correct}, - {*correct}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(correct->dtype())))}}); - runner_cast_correct.Run(stream); - - // [total] - total->mutable_data(ctx.GetPlace()); 
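// Illustrative aside (assumed values, not from the original file): the
// pipeline above marks a sample correct when any of its top-k indices
// equals the label, so with num_samples = 4 and per-sample hits
// {1, 0, 1, 1}:
//   correct  = ReduceSum(ReduceMax(equal_cast, axis=1)) = 3
//   total    = num_samples                              = 4
//   accuracy = correct / total                          = 0.75
// which is exactly what the Div runner at the end computes.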
- FillNpuTensorWithConstant(total, static_cast(num_samples)); - - // use `total` of type `float32` for calculating accuracy - Tensor tmp_total(phi::DataType::FLOAT32); - tmp_total.Resize(total->dims()); - tmp_total.mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(&tmp_total, - static_cast(num_samples)); - - // [accuracy] - accuracy->mutable_data(ctx.GetPlace()); - const auto& runner_accuracy = - NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); - runner_accuracy.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - accuracy, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel); diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc deleted file mode 100644 index d8b713de96fff..0000000000000 --- a/paddle/fluid/operators/mul_op_npu.cc +++ /dev/null @@ -1,274 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {*x, *y}, - {*out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner.Run(stream); - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // reshape - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - out->mutable_data(ctx.GetPlace()); - // matmul - const auto& runner = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {*out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner.Run(stream); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("npu error: not support dims")); - } - // to do other - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only support x_num_col_dims == 2: but got %d", - x_num_col_dims)); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(y->dtype()) == - framework::proto::VarType::FP16) { - // NOTE: When the dim of the input and output shapes is 
inconsistent, - // (Boradcast) BatchMatMul NPU OP only support FP16. - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*x, *y}, - {*out}, - {{"adj_x1", false}, {"adj_x2", false}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - // flatten => x.shape=[6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - - // matmul [6,4] , [4, 5] => [6, 5] - out->mutable_data(ctx.GetPlace()); - - Tensor tmp_out(x->type()); - tmp_out.ShareDataWith(*out); - tmp_out.Resize(phi::make_ddim({first_dim, y->dims()[1]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {tmp_out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner_matmul.Run(stream); - } - } - } -}; - -template -class MulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", - {*dout, *y}, - {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - - runner_dx.Run(stream); - } - - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {*x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // flatten => x.shape=[6, 4] - // matmul - if (dx) { - // matmul [2, 5] * [12, 5] => [2, 12] - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout->dims()[0], y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {*dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - - if (dy) { - // flatten - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only support x_num_col_dims == 2: but got %d", - x_num_col_dims)); - // tmp_dout both used by dx and dy - Tensor tmp_dout(x->type()); - int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; - int64_t dout_sec_dim = dout->dims()[2]; - tmp_dout.ShareDataWith(*dout); - tmp_dout.Resize(phi::make_ddim({dout_first_dim, dout_sec_dim})); - - if (dx) { - // tmp_dout * y [2, 3, 5] * 
[4,5] => [2, 3, 4] - if (framework::TransToProtoVarType(dout->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(y->dtype()) == - framework::proto::VarType::FP16) { - // NOTE: When the dim of the input and output shapes is inconsistent, - // (Boradcast) BatchMatMul NPU OP only support FP16. - dx->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*dout, *y}, - {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout_first_dim, y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - } - if (dy) { - // flatten x.shape [2,3,4] => [6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - // mamtul [6,4] [6,5] =>[4,5] - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, tmp_dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - mul, - ops::MulNPUKernel, - ops::MulNPUKernel); -REGISTER_OP_NPU_KERNEL( - mul_grad, - ops::MulGradNPUKernel, - ops::MulGradNPUKernel); diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc deleted file mode 100644 index 425b7c6738633..0000000000000 --- a/paddle/fluid/operators/multinomial_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in -// cmake/operators.cmake when Paddle supports -#if (CANN_VERSION_CODE >= 504000) - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class NPUMultinomialKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - out->mutable_data(place); - - const auto& runner = NpuOpRunner( - "MultinomialWithReplacementD", - {*x}, - {*out}, - {{"num_samples", num_samples}, {"replacement", replacement}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - multinomial, - ops::NPUMultinomialKernel, - ops::NPUMultinomialKernel) -#endif diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc deleted file mode 100644 index b839b3e8ec2e0..0000000000000 --- a/paddle/fluid/operators/norm_op_npu.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -void CheckAxis(int axis, int rank) { - // check the axis is in [-rank, rank-1] - if (axis <= rank - 1 && axis >= -rank) return; - PADDLE_THROW(platform::errors::InvalidArgument( - "axis in norm operator must between (%d) and (%d)" - "but got (%d).", - -rank, - rank - 1, - axis)); -} - -template -class NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - VLOG(4) << "Launch Norm Op Kernel on NPU." 
<< std::endl; - auto *in_x = ctx.Input("X"); - auto *out_y = ctx.Output("Out"); - auto *out_norm = ctx.Output("Norm"); - out_y->mutable_data(ctx.GetPlace()); - out_norm->mutable_data(ctx.GetPlace()); - auto xdim = in_x->dims(); - float eps = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - CheckAxis(axis, xdim.size()); - if (axis < 0) axis = xdim.size() + axis; - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["axes"] = std::vector({axis}); - attr_input_norm["p"] = 2; - attr_input_norm["keepdim"] = true; - attr_input_norm["epsilon"] = eps; - const auto &runner = - NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - NpuOpRunner("Div", {*in_x, *out_norm}, {*out_y}, {}).Run(stream); - } -}; - -template -class NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Out"); - auto *dy = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - - auto xdim = x->dims(); - CheckAxis(axis, xdim.size()); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["dim"] = std::vector({axis}); - attr_input_norm["eps"] = epsilon; - const auto &runner = - NpuOpRunner("L2NormalizeGrad", {*x, *y, *dy}, {*dx}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - norm, - ops::NormNPUKernel, - ops::NormNPUKernel) - -REGISTER_OP_NPU_KERNEL( - norm_grad, - ops::NormGradNPUKernel, - ops::NormGradNPUKernel); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc deleted file mode 100644 index e44f6286afa9b..0000000000000 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/one_hot_op.h" - -namespace paddle { -namespace operators { - -template -class OneHotNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot, - ops::OneHotNPUKernel, - ops::OneHotNPUKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc deleted file mode 100644 index b213d3345d1f0..0000000000000 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class OneHotV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot_v2, - ops::OneHotV2NPUKernel, - ops::OneHotV2NPUKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc deleted file mode 100644 index 3324e56b3b95f..0000000000000 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ /dev/null @@ -1,345 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class AdamNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Grad(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(param_var->Type()))); - auto* grad = ctx.Input("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector( - *skip_update_tensor, ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update) { - VLOG(4) << "Adam skip update"; - framework::TensorCopy( - *param, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - framework::TensorCopy( - *mom1, - ctx.GetPlace(), - ctx.template device_context(), - mom1_out); - framework::TensorCopy( - *mom2, - ctx.GetPlace(), - ctx.template device_context(), - mom2_out); - framework::TensorCopy( - *beta1_pow, - beta1_pow->place(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, - beta2_pow->place(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - param_out->mutable_data(ctx.GetPlace()); - mom1_out->mutable_data(ctx.GetPlace()); - mom2_out->mutable_data(ctx.GetPlace()); - - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform - // place. 
- phi::DenseTensor beta1_pow_tmp; - phi::DenseTensor beta2_pow_tmp; - if (beta1_pow->place() == platform::CPUPlace()) { - T beta1 = *beta1_pow->data(); - beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); - beta1_pow = &beta1_pow_tmp; - } - if (beta2_pow->place() == platform::CPUPlace()) { - T beta2 = *beta2_pow->data(); - beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); - beta2_pow = &beta2_pow_tmp; - } - - const phi::DenseTensor* beta1_tensor = nullptr; - const phi::DenseTensor* beta2_tensor = nullptr; - const phi::DenseTensor* epsilon_tensor = nullptr; - - phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32); - phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32); - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - - if (ctx.HasInput("Beta1Tensor")) { - beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - } else { - T beta1 = static_cast(ctx.Attr("beta1")); - beta1_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_tmp, beta1); - beta1_tensor = &beta1_tmp; - } - - if (ctx.HasInput("Beta2Tensor")) { - beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - } else { - T beta2 = static_cast(ctx.Attr("beta2")); - beta2_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_tmp, beta2); - beta2_tensor = &beta2_tmp; - } - - if (ctx.HasInput("EpsilonTensor")) { - epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - } else { - T epsilon = static_cast(ctx.Attr("epsilon")); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - } - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ApplyAdamD", - { - *param, - *mom1, - *mom2, - *beta1_pow, - *beta2_pow, - *lr, - *beta1_tensor, - *beta2_tensor, - *epsilon_tensor, - *grad, - }, - { - *param_out, - *mom1_out, - *mom2_out, - }, - {}); - runner.Run(stream); - - // NOTE(zhiqiu): ApplyAdamD updates params inplace, so - // if param and param_out is not same, we need to do copy. 
- if (param_out->data() != param->data()) { - framework::TensorCopy( - *param, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - } - if (mom1_out->data() != mom1->data()) { - framework::TensorCopy( - *mom1, - ctx.GetPlace(), - ctx.template device_context(), - mom1_out); - } - if (mom2_out->data() != mom2->data()) { - framework::TensorCopy( - *mom2, - ctx.GetPlace(), - ctx.template device_context(), - mom2_out); - } - if (!use_global_beta_pow) { - beta1_pow_out->mutable_data(ctx.GetPlace()); - beta2_pow_out->mutable_data(ctx.GetPlace()); - const auto& runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); - runner_m1.Run(stream); - const auto& runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); - runner_m2.Run(stream); - } - } -}; - -template -class AdamWNPUKernel : public AdamNPUKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - VLOG(3) << "NPU AdamW Kernel"; - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - VLOG(3) << "Has SkipUpdate"; - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector( - *skip_update_tensor, ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - VLOG(3) << "Skip update" << skip_update; - bool with_decay = ctx.Attr("with_decay"); - if (!skip_update && with_decay) { - float coeff = ctx.Attr("coeff"); - auto* lr = ctx.Input("LearningRate"); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor one(phi::DataType::FLOAT32); - phi::DenseTensor decay(phi::DataType::FLOAT32); - phi::DenseTensor tmp(phi::DataType::FLOAT32); - - tmp.mutable_data({1}, place); - one.mutable_data({1}, place); - decay.mutable_data({1}, place); - - FillNpuTensorWithConstant(&one, 1.0f); - framework::NPUAttributeMap attr_input = {{"value", coeff}}; - - const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input); - runner1.Run(stream); - - const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {}); - runner2.Run(stream); - - if (ctx.HasInput("MasterParam")) { - PADDLE_THROW(platform::errors::Unimplemented( - "Master Parma is not supported on npu")); - } else { - auto* param_out = ctx.Output("ParamOut"); - param_out->mutable_data(ctx.GetPlace()); - - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); - - const auto& runner = - NpuOpRunner("Mul", - {*param, decay}, - {*const_cast(param)}, - {}); - runner.Run(stream); - } - } - AdamNPUKernel::Compute(ctx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - adam, - ops::AdamNPUKernel, - ops::AdamNPUKernel); - -REGISTER_OP_NPU_KERNEL(adamw, - ops::AdamWNPUKernel, - ops::AdamWNPUKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc deleted file mode 100644 index 83c805a1f642a..0000000000000 --- 
a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMergedMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - size_t n = params.size(); - PADDLE_ENFORCE_EQ(n, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(params[i], - params_out[i], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - } - - auto grads = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ( - n, - grads.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grads.size(), - n)); - - auto velocitys = ctx.MultiInput("Velocity"); - PADDLE_ENFORCE_EQ(n, - velocitys.size(), - platform::errors::InvalidArgument( - "The size of Input(Velocity) must be equal to " - "Input(Param), but got the size of Input(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocitys.size(), - n)); - - auto velocitys_out = ctx.MultiOutput("VelocityOut"); - PADDLE_ENFORCE_EQ( - n, - velocitys_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be " - "equal to Input(Param), but got the size of Output(VelocityOut) is " - "%d, the size of Input(Param) is %d.", - velocitys_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(velocitys[i], - velocitys_out[i], - platform::errors::InvalidArgument( - "Input(Velocity) and Output(VelocityOut) must be " - "the same Tensors.")); - } - - T mu = static_cast(ctx.Attr("mu")); - auto lrs = ctx.MultiInput("LearningRate"); - if (lrs.size() != 1) { - PADDLE_ENFORCE_EQ( - n, - lrs.size(), - platform::errors::InvalidArgument( - "If the size of Input(LearningRate) is not 1, the size of " - "Input(LearningRate) must be " - "equal to Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lrs.size(), - n)); - } - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_methods = - ctx.Attr>("regularization_method"); - auto regularization_coeffs = 
- ctx.Attr>("regularization_coeff"); - if (regularization_methods.size() != 0) { - PADDLE_ENFORCE_EQ( - n, - regularization_methods.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_method) must be equal " - "to Input(Param), but got the size of " - "Attr(regularization_method) is %d, the size of Input(Param) is " - "%d.", - regularization_methods.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - regularization_coeffs.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_coeff) must be equal " - "to Input(Param), but got the size of Attr(regularization_coeff) " - "is %d, the size of Input(Param) is %d.", - regularization_coeffs.size(), - n)); - } - - VLOG(5) << "use_nesterov: " << use_nesterov - << ", regularization_methods.size(): " - << regularization_methods.size() - << ", regularization_coeffs.size(): " - << regularization_coeffs.size(); - - auto& dev_ctx = ctx.template device_context(); - - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - for (size_t idx = 0; idx < n; ++idx) { - phi::RegularizationType regularization_flag = - regularization_methods.size() > 0 && - regularization_methods[idx] == "l2_decay" - ? phi::RegularizationType::kL2DECAY - : phi::RegularizationType::kNONE; - float regularization_coeff = 0.0; - if (regularization_coeffs.size() != 0) { - regularization_coeff = regularization_coeffs[idx]; - } - - auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0]; - auto param = params[idx]; - auto param_out = params_out[idx]; - auto velocity = velocitys[idx]; - auto velocity_out = velocitys_out[idx]; - - auto grad = grads[idx]; - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(merged_momentum, - ops::NPUMergedMomentumOpKernel, - ops::NPUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc deleted file mode 100644 index a5349e05b9b02..0000000000000 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/optimizers/momentum_op.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - std::string regularization_method = - ctx.Attr("regularization_method"); - auto regularization_coeff = ctx.Attr("regularization_coeff"); - phi::RegularizationType regularization_flag{ - phi::RegularizationType::kNONE}; // disable regularization - if (regularization_method == "l2_decay") { - regularization_flag = phi::RegularizationType::kL2DECAY; - } - - T mu = static_cast(ctx.Attr("mu")); - bool use_nesterov = ctx.Attr("use_nesterov"); - - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); - - auto* grad_var = ctx.InputVar("Grad"); - if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ( - false, - true, - platform::errors::PermissionDenied("Unsupport SparseMomentum")); - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in MomentumOp. 
Excepted LodTensor " - "or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(momentum, - ops::NPUMomentumOpKernel, - ops::NPUMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc deleted file mode 100644 index 6ee01272f47e8..0000000000000 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class RMSPROPNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *grad_var = ctx.InputVar("Grad"); - auto *param_out = ctx.Output("ParamOut"); - auto *moment_out = ctx.Output("MomentOut"); - auto *mean_square_out = ctx.Output("MeanSquareOut"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - auto rho = static_cast(ctx.Attr("decay")); - auto momentum = static_cast(ctx.Attr("momentum")); - auto *p_tensor = ctx.Input("Param"); - auto *ms_tensor = ctx.Input("MeanSquare"); - auto *lr_tensor = ctx.Input("LearningRate"); - auto *mom_tensor = ctx.Input("Moment"); - bool centered = ctx.Attr("centered"); - - auto stream = - ctx.template device_context() - .stream(); - if (grad_var->IsType()) { - auto *grad_tensor = ctx.Input("Grad"); - if (centered) { - framework::NPUAttributeMap attr_input = {{"use_locking", false}}; - const phi::DenseTensor *rho_tensor = nullptr; - const phi::DenseTensor *momentum_tensor = nullptr; - const phi::DenseTensor *epsilon_tensor = nullptr; - phi::DenseTensor rho_tmp(phi::DataType::FLOAT32); - rho_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&rho_tmp, rho); - rho_tensor = &rho_tmp; - phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32); - momentum_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&momentum_tmp, momentum); - momentum_tensor = &momentum_tmp; - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - auto *mg_tensor = ctx.Input("MeanGrad"); - auto *mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - const auto &runner_applycenterrmsprop = NpuOpRunner( - std::string("ApplyCenteredRMSPropD"), - {*p_tensor, - *mg_tensor, - *ms_tensor, - *mom_tensor, - *lr_tensor, - *rho_tensor, - *momentum_tensor, - *epsilon_tensor, - *grad_tensor}, - {*param_out, *mean_grad_out, *mean_square_out, *moment_out}, - {attr_input}); - 
runner_applycenterrmsprop.Run(stream); - } else { - framework::NPUAttributeMap attr_input = { - {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}}; - const auto &runner_applyrmsprop = NpuOpRunner( - std::string("ApplyRMSPropD"), - {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor}, - {*param_out, *mean_square_out, *moment_out}, - {attr_input}); - runner_applyrmsprop.Run(stream); - } - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in RmspropOp. Excepted LodTensor, " - "But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - rmsprop, ops::RMSPROPNPUKernel) diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc deleted file mode 100644 index 7bd5cf8793cd0..0000000000000 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/optimizers/sgd_op.h" - -namespace paddle { -namespace operators { - -template -class SGDNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* learning_rate = ctx.Input("LearningRate"); - auto* param_var = ctx.Input("Param"); - auto* grad_var = ctx.Input("Grad"); - auto* param_out = ctx.Output("ParamOut"); - - param_out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ApplyGradientDescent", - {*param_var, *learning_rate, *grad_var}, - {*param_out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so - // if param and param_out is not same, we need to do copy. - if (param_out->data() != param_var->data()) { - framework::TensorCopy( - *param_var, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - sgd, - ops::SGDNPUKernel, - ops::SGDNPUKernel, - ops::SGDNPUKernel); diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc deleted file mode 100644 index c2d99fa42f2f8..0000000000000 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class PnormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_x = ctx.Input("X"); - auto* out_norm = ctx.Output("Out"); - out_norm->mutable_data(ctx.GetPlace()); - - float porder = ctx.Attr("porder"); - int axis = ctx.Attr("axis"); - bool keepdim = ctx.Attr("keepdim"); - - auto xdim = in_x->dims(); - if (axis < 0) axis = xdim.size() + axis; - - auto stream = - ctx.template device_context() - .stream(); - - int p = 0; - bool combine_op = - !(porder == 0 || porder == INFINITY || porder == -INFINITY); - if (porder == INFINITY) { - p = INT_MAX; - } else if (porder == -INFINITY) { - p = INT_MIN; - } else { - p = static_cast(porder); - float t = 0; - float diff = abs(std::modf(porder, &t)); - if (diff < 1e-5) { - combine_op = false; - } - } - - if (!combine_op) { - const auto& runner = NpuOpRunner("LpNorm", - {*in_x}, - {*out_norm}, - {{"p", p}, - {"axes", std::vector({axis})}, - {"keep_dims", keepdim}}); - runner.Run(stream); - } else { - phi::DenseTensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - - const auto& power_runner1 = - NpuOpRunner("Power", - {*in_x}, - {tmp_x}, - {{"power", porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner1.Run(stream); - - const auto& reduce_runner = NpuOpRunner( - "ReduceSumD", - {tmp_x}, - {*out_norm}, - {{"axes", std::vector({axis})}, {"keep_dims", keepdim}}); - reduce_runner.Run(stream); - - const auto& power_runner2 = NpuOpRunner( - "Power", - {*out_norm}, - {*out_norm}, - {{"power", 1 / porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner2.Run(stream); - } - } -}; - -template -class PnormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Out"); - auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - auto xdim = x->dims(); - float porder = ctx.Attr("porder"); - bool keepdim = ctx.Attr("keepdim"); - - int axis = ctx.Attr("axis"); - axis = axis < 0 ? 
xdim.size() + axis : axis; - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor y_share(y->type()); - phi::DenseTensor dy_share(dy->type()); - y_share.ShareDataWith(*y); - dy_share.ShareDataWith(*dy); - auto ydim = xdim; - if (!keepdim) { - ydim[axis] = 1; - } else { - ydim = y->dims(); - } - y_share.Resize(ydim); - dy_share.Resize(ydim); - - if (porder == 0) { - FillNpuTensorWithConstant(dx, static_cast(0)); - dx->Resize(xdim); - } else if (porder == INFINITY || porder == -INFINITY) { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor t_cond; - t_cond.mutable_data(xdim, place); - const auto& r_equal = - NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); - r_equal.Run(stream); - - phi::DenseTensor t_zero; - t_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_zero, static_cast(0)); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - const auto& r_mul = NpuOpRunner("Mul", {x_sign, dy_share}, {*dx}, {}); - r_mul.Run(stream); - - const auto& r_sel = - NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); - r_sel.Run(stream); - } else { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - phi::DenseTensor y_pow; - y_pow.mutable_data(ydim, place); - if (porder >= 1) { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {x_abs, y_pow}, {*dx}, {}); - r_div.Run(stream); - } else { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {y_pow, x_abs}, {*dx}, {}); - r_div.Run(stream); - } - - const auto& r_mul1 = NpuOpRunner("Mul", {*dx, x_sign}, {*dx}, {}); - r_mul1.Run(stream); - - const auto& r_mul2 = NpuOpRunner("Mul", {*dx, dy_share}, {*dx}, {}); - r_mul2.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - p_norm, - ops::PnormNPUKernel, - ops::PnormNPUKernel); - -REGISTER_OP_NPU_KERNEL( - p_norm_grad, - ops::PnormGradNPUKernel, - ops::PnormGradNPUKernel); diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc deleted file mode 100644 index 0f45d0b51c837..0000000000000 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - paddle::framework::TensorToVector( - *paddings_t, context.device_context(), &paddings); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto in_dims = x->dims(); - - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - float value = context.Attr("value"); - auto data_format = context.Attr("data_format"); - - auto* out = context.Output("Out"); - - PADDLE_ENFORCE_LT(abs(value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support constant_values=0 right now," - "but received constant_value is %f .", - value)); - - PADDLE_ENFORCE_EQ(mode, - "constant", - platform::errors::Unimplemented( - "Ascend npu only support mode=constant right now," - "but received mode is %s .", - mode)); - - std::vector paddings( - {0, 0, 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1]}); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], - in_dims[1], - in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], - in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], - in_dims[4]}); - paddings = { - 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1], 0, 0}; - } - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("PadV3") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddInput( - std::vector({0})) // npu only support constant_value=0 now - .AddOutput(*out) - .AddAttr("mode", mode); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Pad3dGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - d_in->mutable_data(context.GetPlace()); - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - - auto stream = - context.template device_context() - .stream(); - - std::vector size( - {d_in_dims[0], d_in_dims[1], d_in_dims[2], d_in_dims[3], d_in_dims[4]}); - if (mode == "constant") { // this method can be only used for constant mode - std::vector offsets({0, 0, pad_front, pad_top, pad_left}); - if (data_format == "NDHWC") { - offsets = {0, pad_front, 
pad_top, pad_left, 0}; - } - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_in}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad3d, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad3d_grad, - ops::Pad3dNPUKernel, - ops::Pad3dGradNPUKernel); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc deleted file mode 100644 index 48c2254b1ec91..0000000000000 --- a/paddle/fluid/operators/pad_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class PadNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto paddings = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - - PADDLE_ENFORCE_LT(abs(pad_value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support pad_value=0 right now," - "but received pad_value is %f .", - pad_value)); - - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("Pad") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddOutput(*out); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PadGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto paddings = context.Attr>("paddings"); - - d_x->mutable_data(context.GetPlace()); - - auto d_x_dims = d_x->dims(); - auto size = phi::vectorize(d_x_dims); - std::vector offsets(0); - int i = 0; - for (auto iter = paddings.begin(); iter < paddings.end(); ++iter, ++i) { - if (i % 2 == 0) { - offsets.push_back(*iter); - } - } - - auto stream = - context.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_x}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad, - ops::PadNPUKernel, - ops::PadNPUKernel, - ops::PadNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad_grad, - ops::PadGradNPUKernel, - ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc deleted file mode 100644 index e14c55a63642a..0000000000000 --- a/paddle/fluid/operators/pool_op_npu.cc +++ /dev/null @@ -1,334 +0,0 @@ -/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/pooling.h" - -namespace paddle { -namespace operators { - -template -class NPUPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - - bool global_pooling = ctx.Attr("global_pooling"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - - Tensor in_x_tensor, out_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive) { - std::string pooling_mode = "AdaptiveAvgPool2d"; - if (pooling_type == "max") { - pooling_mode = "AdaptiveMaxPool2d"; - } - - // AdaptiveAvgPool2d only support NCHW - Tensor transformed_input, transformed_output; - if (pooling_type == "avg" && channel_last) { - transformed_input.mutable_data( - phi::make_dim( - in_x_dims[0], in_x_dims[3], in_x_dims[1], in_x_dims[2]), - ctx.GetPlace()); - 
transformed_output.mutable_data( - phi::make_dim(out_dims[0], out_dims[3], out_dims[1], out_dims[2]), - ctx.GetPlace()); - - const auto &trans_runner = - NpuOpRunner("TransData", - {in_x_tensor}, - {transformed_input}, - {{"src_format", std::string("NHWC")}, - {"dst_format", std::string("NCHW")}}); - trans_runner.Run(dev_ctx.stream()); - } else { - transformed_input.ShareDataWith(in_x_tensor); - transformed_output.ShareDataWith(out_tensor); - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {transformed_input}, - {transformed_output}, - {{"output_size", phi::vectorize(out_data_dims)}}); - runner.Run(dev_ctx.stream()); - - if (pooling_type == "avg" && channel_last) { - const auto &trans_runner = - NpuOpRunner("TransData", - {transformed_output}, - {out_tensor}, - {{"src_format", std::string("NCHW")}, - {"dst_format", std::string("NHWC")}}); - trans_runner.Run(dev_ctx.stream()); - } - } else { - std::string pooling_mode = "AvgPoolV2"; - if (pooling_type == "max") { - PADDLE_ENFORCE_EQ( - exclusive, - true, - platform::errors::InvalidArgument( - "MaxPool only support exclusive=false, but got true")); - pooling_mode = "MaxPoolV3"; - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {in_x_tensor}, - {out_tensor}, - {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class NPUPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = - ctx.Output(framework::GradVarName("X")); - in_x_grad->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update paddings - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - Tensor in_x_tensor, out_tensor, out_grad_tensor, in_x_grad_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - out_grad_tensor.ShareDataWith(*out_grad); - in_x_grad_tensor.ShareDataWith(*in_x_grad); - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - out_grad_tensor.set_layout(DataLayout::kNHWC); - in_x_grad_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = 
phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive || (global_pooling && pooling_type == "max")) { - PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - if (channel_last) { - strides_vec[1] = data_dims[0] / out_data_dims[0]; - strides_vec[2] = data_dims[1] / out_data_dims[1]; - ksize_vec[1] = strides_vec[1]; - ksize_vec[2] = strides_vec[2]; - } else { - strides_vec[2] = data_dims[0] / out_data_dims[0]; - strides_vec[3] = data_dims[1] / out_data_dims[1]; - ksize_vec[2] = strides_vec[2]; - ksize_vec[3] = strides_vec[3]; - } - } - - NPUAttributeMap attrs = {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}; - - if (pooling_type == "max") { - if (global_pooling) { - for (auto &s : strides_vec) { - s = 1; - } - PADDLE_ENFORCE_LT(std::max(data_dims[0], data_dims[1]), - 255, - platform::errors::InvalidArgument( - "MaxPoolGrad H, W must be less than 255 when " - "global_pooling = True, but got %s", - data_dims)); - attrs["global_pooling"] = false; - } - - const auto &runner = - NpuOpRunner("MaxPoolV3Grad", - {in_x_tensor, out_tensor, out_grad_tensor}, - {in_x_grad_tensor}, - attrs); // 0: floor, 1: ceil - runner.Run(dev_ctx.stream()); - } else if (pooling_type == "avg") { - PADDLE_ENFORCE(strides[0] == strides[1], - platform::errors::InvalidArgument( - "AvgPoolGrad dose not support Asymmetric strides. 
but " - "strides = (%d, %d)", - strides[0], - strides[1])); - - NpuOpRunner runner; - runner.SetType("AvgPoolV2Grad"); - runner.AddInput(phi::vectorize(in_x->dims())); - runner.AddInput(out_grad_tensor); - runner.AddOutput(in_x_grad_tensor); - runner.AddAttrs(attrs); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(pool2d, - ops::NPUPoolOpKernel, - ops::NPUPoolOpKernel); -REGISTER_OP_NPU_KERNEL(pool2d_grad, - ops::NPUPoolGradOpKernel, - ops::NPUPoolGradOpKernel); diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc deleted file mode 100644 index fd03ce027bda5..0000000000000 --- a/paddle/fluid/operators/randperm_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/randperm_op.h" - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_NPU_KERNEL( - randperm, kernel, kernel, kernel, kernel); diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc deleted file mode 100644 index b2266608d7dca..0000000000000 --- a/paddle/fluid/operators/range_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/range_op.h" - -namespace paddle { -namespace operators { - -template -class RangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); - - phi::DenseTensor n; - framework::TensorCopy( - *start_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T start = n.data()[0]; - framework::TensorCopy( - *end_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T end = n.data()[0]; - framework::TensorCopy( - *step_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T step = n.data()[0]; - - int64_t size = 0; - GetSize(start, end, step, &size); - - out->Resize(phi::make_ddim({size})); - out->mutable_data(context.GetPlace()); - - std::vector odata; - T value = start; - for (int64_t i = 0; i < size; ++i) { - odata.push_back(value); - value += step; - } - - framework::TensorFromVector(odata, context.device_context(), out); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(range, - paddle::operators::RangeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::RangeNPUKernel, -#endif - paddle::operators::RangeNPUKernel, - paddle::operators::RangeNPUKernel) diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc deleted file mode 100644 index 068d5d6be12cd..0000000000000 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(range); -USE_OP_DEVICE_KERNEL(range, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto start = scope->Var("Start"); - auto tensor_start = start->GetMutable(); - std::vector init_start; - init_start.push_back(static_cast(1)); - paddle::framework::TensorFromVector(init_start, ctx, tensor_start); - tensor_start->Resize({1}); - - auto end = scope->Var("End"); - auto tensor_end = end->GetMutable(); - std::vector init_end; - init_end.push_back(static_cast(10)); - paddle::framework::TensorFromVector(init_end, ctx, tensor_end); - tensor_end->Resize({1}); - - auto step = scope->Var("Step"); - auto tensor_step = step->GetMutable(); - std::vector init_step; - init_step.push_back(static_cast(2)); - paddle::framework::TensorFromVector(init_step, ctx, tensor_step); - tensor_step->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - auto op = f::OpRegistry::CreateOp( - op_type, - {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}}, - {{"Out", {"Out"}}}, - {}); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); - EXPECT_EQ(static_cast(out_vec[0]), static_cast(1.0)); - EXPECT_EQ(static_cast(out_vec[1]), static_cast(3.0)); - EXPECT_EQ(static_cast(out_vec[2]), static_cast(5.0)); - EXPECT_EQ(static_cast(out_vec[3]), static_cast(7.0)); - EXPECT_EQ(static_cast(out_vec[4]), static_cast(9.0)); -} - -TEST(range, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "range"); -} diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 7c2f91999e964..27a2ff68d3aad 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -33,10 +33,3 @@ if(WITH_ROCM) SRCS check_reduce_rank_test.cu DEPS tensor) endif() - -if(WITH_ASCEND_CL) - cc_test( - reduce_any_op_npu_test - SRCS reduce_any_op_npu_test.cc - DEPS op_registry reduce_any_op scope device_context enforce executor) -endif() diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc deleted file mode 100644 index 7ec3183d412d4..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class ReduceAnyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - bool keep_dim = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - out->mutable_data(ctx.GetPlace()); - - // set attr - NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; - - const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc deleted file mode 100644 index aec1640181bcc..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(reduce_any); -USE_OP_DEVICE_KERNEL(reduce_any, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - std::vector init_x = {true, false, false, false}; - f::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - std::vector axes; - f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}}; - auto op = f::OpRegistry::CreateOp( - "reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - - ctx.Wait(); - - std::vector out_vec; - f::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - std::vector expected_vec = {true}; - EXPECT_EQ(out_vec.size(), expected_vec.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_vec[i]); - } -} - -TEST(reduce_any, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc deleted file mode 100644 index de4049c7e7f97..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - bool reduce_all = ctx.Attr("reduce_all"); - int out_dtype = ctx.Attr("out_dtype"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor cast_out(x->type()); - cast_out.Resize(out->dims()); - cast_out.mutable_data(place); - - auto cast_out_dtype = framework::TransToProtoVarType(x->dtype()); - if (out_dtype != -1) { - cast_out_dtype = static_cast(out_dtype); - } - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - if (cast_out_dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - } else { - out->ShareDataWith(cast_out); - } - - framework::NPUAttributeMap attr_input = {{"axes", dims}, - {"keep_dims", keep_dim}}; - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; - } - - const auto& dev_ctx = - ctx.template device_context(); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT64) { - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = - NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs); - runner.Run(dev_ctx.stream()); - }; - - NpuOpRunner::TypeAdapter({*x}, - {cast_out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = - NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input); - runner.Run(dev_ctx.stream()); - } - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(dev_ctx.stream()); - } - } -}; - -template -class ReduceMaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto reduce_dims = context.Attr>("dim"); - bool reduce_all = context.Attr("reduce_all"); - int in_dtype = context.Attr("in_dtype"); - - PADDLE_ENFORCE_EQ( - in_dtype == -1, - true, - platform::errors::InvalidArgument( - "NPU only support in_dtype == -1 in reduce_max_grad op.")); - - auto* x_grad = - context.Output(framework::GradVarName("X")); - x_grad->mutable_data(context.GetPlace()); - - auto& dev_ctx = - context.template 
device_context(); - auto place = context.GetPlace(); - auto stream = dev_ctx.stream(); - - // broadcast - auto x_dims_vec = phi::vectorize(x->dims()); - if (reduce_all) { - reduce_dims.clear(); - for (size_t d = 0; d < x_dims_vec.size(); ++d) { - reduce_dims.push_back(static_cast(d)); - } - } - - phi::DenseTensor tmp_out, tmp_out_grad; - auto tmp_out_dims_vec = x_dims_vec; - for (auto d : reduce_dims) { - if (d < 0) { - d += x_dims_vec.size(); - } - tmp_out_dims_vec[d] = 1; - } - - tmp_out.ShareDataWith(*out); - tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec)); - tmp_out_grad.ShareDataWith(*out_grad); - tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); - - phi::DenseTensor transformed_out(x->type()); - transformed_out.Resize(phi::make_ddim(x_dims_vec)); - transformed_out.mutable_data(place); - NpuOpRunner r_brd_out; - r_brd_out.SetType("BroadcastTo") - .AddInput(tmp_out) - .AddInput(std::move(x_dims_vec)) - .AddOutput(transformed_out) - .Run(stream); - phi::DenseTensor transformed_out_grad(x->type()); - transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); - transformed_out_grad.mutable_data(place); - NpuOpRunner r_brd_out_grad; - r_brd_out_grad.SetType("BroadcastTo") - .AddInput(tmp_out_grad) - .AddInput(std::move(x_dims_vec)) - .AddOutput(transformed_out_grad) - .Run(stream); - - // compare - phi::DenseTensor equal_cond; - equal_cond.mutable_data(x_grad->dims(), place); - const auto& r_equal = - NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {}); - r_equal.Run(stream); - - // select - phi::DenseTensor t_zero; - t_zero.mutable_data(x_grad->dims(), place); - FillNpuTensorWithConstant(&t_zero, static_cast(0)); - t_zero.Resize(x_grad->dims()); - - const auto& r_sel = NpuOpRunner( - "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {}); - r_sel.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - reduce_max, - ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel); -REGISTER_OP_NPU_KERNEL( - reduce_max_grad, - ops::ReduceMaxGradNPUKernel, - ops::ReduceMaxGradNPUKernel, - ops::ReduceMaxGradNPUKernel, - ops::ReduceMaxGradNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc deleted file mode 100644 index 65fabbd21cb7e..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
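The grad kernel deleted above routes gradient only to the max positions: it broadcasts Out and dOut back to X's shape, marks positions where X equals the broadcast max (Equal), and selects dOut there and zero elsewhere (SelectV2). A scalar sketch of the same rule for the reduce_all case (plain C++, assuming the exact-equality semantics of the NPU Equal op):

#include <vector>

// dX[i] = dOut if X[i] equals the max, else 0; note that ties all receive
// dOut, matching the Equal + SelectV2 sequence above.
std::vector<float> ReduceMaxGradAll(const std::vector<float>& x,
                                    float out_max, float dout) {
  std::vector<float> dx(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == out_max) dx[i] = dout;
  }
  return dx;
}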
*/ -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" - -namespace paddle { -namespace operators { - -template -class NPUReduceMeanOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - - bool reduce_all = ctx.Attr("reduce_all"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - - auto input_dims = input->dims(); - if (reduce_all) { - dims.clear(); - for (int i = 0; i < input_dims.size(); i++) { - dims.push_back(static_cast(i)); - } - } - - auto stream = - ctx.template device_context() - .stream(); - - NpuOpRunner runner; - runner.SetType("ReduceMean") - .AddInput(*input) - .AddInput(std::move(dims)) - .AddOutput(*output) - .AddAttrs({{"keep_dims", keep_dim}}) - .Run(stream); - } -}; - -template -class NPUReduceMeanGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - - bool reduce_all = ctx.Attr("reduce_all"); - auto reduce_dims = ctx.Attr>("dim"); - auto input_dims = input->dims(); - - int reduce_numel = 1; - if (reduce_all) { - reduce_dims.clear(); - for (int d = 0; d < input_dims.size(); ++d) { - reduce_dims.push_back(static_cast(d)); - } - } - for (auto& d : reduce_dims) { - if (d < 0) { - d = d + input_dims.size(); - } - reduce_numel *= input_dims[d]; - } - - phi::DenseTensor tensor_value(input_grad->dtype()); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant( - &tensor_value, static_cast(1.0f / static_cast(reduce_numel))); - - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(input_dims)) - .AddInput(tensor_value) - .AddOutput(*input_grad) - .Run(stream); - - phi::DenseTensor transformed_input_grad, transformed_out_grad; - phi::DenseTensor tmp_output_grad; - auto tmp_output_dims = input_dims; - for (auto d : reduce_dims) { - tmp_output_dims[d] = 1; - } - tmp_output_grad.ShareDataWith(*output_grad); - tmp_output_grad.Resize(tmp_output_dims); - auto& dev_ctx = - ctx.template device_context(); - NpuElementWiseOpBroadcast(dev_ctx, - input_grad, - &tmp_output_grad, - 0, - &transformed_input_grad, - &transformed_out_grad); - const auto& runner2 = - NpuOpRunner("Mul", - {transformed_input_grad, transformed_out_grad}, - {*input_grad}, - {}); - runner2.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(reduce_mean, ops::NPUReduceMeanOpKernel); -REGISTER_OP_NPU_KERNEL(reduce_mean_grad, ops::NPUReduceMeanGradOpKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc deleted file mode 100644 index e4adc42283120..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
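The mean-grad kernel deleted above reduces to two device ops: Fill dX with 1 / reduce_numel (the product of the reduced extents), then Mul by dOut broadcast over the reduced dims. For the reduce_all case that collapses to the following sketch (ReduceMeanGradAll is a hypothetical name):

#include <vector>

// Every input element receives dout / n: the "Fill" step, then the "Mul" step.
std::vector<float> ReduceMeanGradAll(size_t n, float dout) {
  std::vector<float> dx(n, 1.0f / static_cast<float>(n));
  for (float& v : dx) v *= dout;
  return dx;
}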
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceMinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - bool reduce_all = ctx.Attr("reduce_all"); - int out_dtype = ctx.Attr("out_dtype"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor cast_out(x->type()); - cast_out.Resize(out->dims()); - cast_out.mutable_data(place); - - auto cast_out_dtype = framework::TransToProtoVarType(x->dtype()); - if (out_dtype != -1) { - cast_out_dtype = static_cast(out_dtype); - } - - if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) { - if (cast_out_dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - } else { - out->ShareDataWith(cast_out); - } - - framework::NPUAttributeMap attr_input = {{"axes", dims}, - {"keep_dims", keep_dim}}; - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; - } - - const auto& dev_ctx = - ctx.template device_context(); - if (x->dtype() == phi::DataType::INT64) { - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = - NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs); - runner.Run(dev_ctx.stream()); - }; - - NpuOpRunner::TypeAdapter({*x}, - {cast_out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = - NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input); - runner.Run(dev_ctx.stream()); - } - - if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) { - auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - reduce_min, - ops::ReduceMinNPUKernel, - ops::ReduceMinNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceMinNPUKernel, -#endif - ops::ReduceMinNPUKernel); diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc deleted file mode 100644 index fd9bf28b60793..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceProdNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - bool reduce_all = ctx.Attr("reduce_all"); - int out_dtype = ctx.Attr("out_dtype"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor cast_out(x->type()); - cast_out.Resize(out->dims()); - cast_out.mutable_data(place); - - auto cast_out_dtype = framework::TransToProtoVarType(x->dtype()); - if (out_dtype != -1) { - cast_out_dtype = static_cast(out_dtype); - } - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - if (cast_out_dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - } else { - out->ShareDataWith(cast_out); - } - - framework::NPUAttributeMap attr_input = {{"axes", dims}, - {"keep_dims", keep_dim}}; - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; - } - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input); - runner.Run(stream); - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - reduce_prod, - ops::ReduceProdNPUKernel, - ops::ReduceProdNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc deleted file mode 100644 index 0c6665494ece7..0000000000000 --- 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/unsqueeze_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceSumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - bool reduce_all = ctx.Attr("reduce_all"); - bool keep_dims = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - out->mutable_data(ctx.GetPlace()); - - // special case - if (x->dims().size() == 1 && keep_dims == false) { - keep_dims = true; - } - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor cast_x; - phi::DenseTensor cast_out; - // NOTE: ReduceSumD only supports fp32 and fp16 - if (framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP32 && - framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP16) { - cast_x.Resize(x->dims()); - cast_x.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); - const auto& runner_cast = NpuOpRunner( - "Cast", {*x}, {cast_x}, {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - - cast_out.Resize(out->dims()); - cast_out.mutable_data(ctx.GetPlace()); - } else { - cast_x.ShareDataWith(*x); - cast_out.ShareDataWith(*out); - } - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - const auto& runner = - NpuOpRunner("ReduceSumD", - {cast_x}, - {cast_out}, - {{"axes", dim_vec}, {"keep_dims", keep_dims}}); - runner.Run(stream); - - } else { - const auto& runner = - NpuOpRunner("ReduceSumD", - {cast_x}, - {cast_out}, - {{"axes", dims}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - - if (framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP32 && - framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP16) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype())); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - } - } -}; - -template -class ReduceSumGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - bool reduce_all = ctx.Attr("reduce_all"); - bool keep_dims = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - x_grad->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - if (keep_dims || reduce_all) { - const auto& runner = 
NpuOpRunner("BroadcastToD", - {*out_grad}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } else { - framework::DDim out_dims; - out_dims = UnsqueezeKernel::GetOutputShape( - dims, out_grad->dims()); - - phi::DenseTensor out_grad_tmp(out_grad->type()); - out_grad_tmp.Resize(out_dims); - out_grad_tmp.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *out_grad, - ctx.GetPlace(), - ctx.template device_context(), - &out_grad_tmp); - out_grad_tmp.Resize(out_dims); - - const auto& runner = NpuOpRunner("BroadcastToD", - {out_grad_tmp}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reduce_sum, - ops::ReduceSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumNPUKernel, -#endif - ops::ReduceSumNPUKernel, - ops::ReduceSumNPUKernel); -REGISTER_OP_NPU_KERNEL( - reduce_sum_grad, - ops::ReduceSumGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumGradNPUKernel, -#endif - ops::ReduceSumGradNPUKernel, - ops::ReduceSumGradNPUKernel); diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc deleted file mode 100644 index 2d4497a19e77b..0000000000000 --- a/paddle/fluid/operators/reshape_op_npu.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/tensor_utils.h" - -namespace paddle { -namespace operators { - -template -class Reshape2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto stream = - ctx.template device_context() - .stream(); - auto place = ctx.GetPlace(); - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector target_shape_vector; - auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); - if (shape_tensor_vector.size() > 0) { - for (auto* shape_tensor : shape_tensor_vector) { - PADDLE_ENFORCE_EQ( - shape_tensor->dims().size(), - 1, - platform::errors::InvalidArgument( - "If the element type of 'shape' in Reshape Op is Tensor, " - "the element's shape must be [1]. But received the element's " - "shape is [%d]", - shape_tensor->dims().size())); - - target_shape_vector.push_back( - phi::GetVectorFromTensor(shape_tensor)[0]); - } - } else { - auto* shape_tensor = ctx.HasInput("Shape") - ? 
ctx.Input("Shape") - : nullptr; - if (shape_tensor) { - target_shape_vector = phi::GetVectorFromTensor(shape_tensor); - } else { - target_shape_vector = ctx.Attr>("shape"); - PADDLE_ENFORCE_GT( - target_shape_vector.size(), - 0, - platform::errors::InvalidArgument( - "The length of shape attribute should be larger than 0 when " - "input ShapeTensor and Shape are empty!")); - } - } - - int num_negative = - std::count(target_shape_vector.begin(), target_shape_vector.end(), -1); - PADDLE_ENFORCE_LE( - num_negative, - 1, - platform::errors::InvalidArgument( - "The max number of -1 in shape attribute or shape tensor is 1 " - "but received %d.", - num_negative)); - auto it_zero = - std::find(target_shape_vector.begin(), target_shape_vector.end(), 0); - if (it_zero != target_shape_vector.end()) { - int x_rank = x->dims().size(); - for (size_t i = 0; i < target_shape_vector.size(); i++) { - if (target_shape_vector[i] == 0) { - PADDLE_ENFORCE_LT( - i, - x_rank, - platform::errors::InvalidArgument( - "The index of 0 in shape attribute or shape tensor", - "should be less than input dim size, ", - "but the index is %d and input dim size is %d", - i, - x_rank)); - target_shape_vector[i] = x->dims().at(i); - } - } - } - - auto it = - std::find(target_shape_vector.begin(), target_shape_vector.end(), -1); - if (it != target_shape_vector.end()) { - auto ddim_out_vec = phi::vectorize(x->dims()); - int ddim_out_product = std::accumulate( - ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies()); - int reshape_out_product = std::accumulate(target_shape_vector.begin(), - target_shape_vector.end(), - -1, - std::multiplies()); - int index = std::distance(target_shape_vector.begin(), it); - target_shape_vector[index] = ddim_out_product / reshape_out_product; - } - - auto out_dims = phi::make_ddim(target_shape_vector); - out->mutable_data(out_dims, place); - - NpuOpRunner runner; - // the shape input must be on the host side - runner.SetType("Reshape") - .AddInput(*x) - .AddInput(std::vector(target_shape_vector)) - .AddOutput(*out) - .AddAttr("axis", 0) - .AddAttr("num_axes", -1); - runner.Run(stream); - } -}; - -template -class Reshape2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto in_dims = d_x->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(in_dims); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reshape2, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel); -REGISTER_OP_NPU_KERNEL( - reshape2_grad, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel); diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc deleted file mode 100644 index 7d15dc2a46558..0000000000000 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class ROIAlignNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); // (B,C,H,W) - auto* ROIs = ctx.Input("ROIs"); // (N,4) - auto* ROIsNum = ctx.Input("RoisNum"); // [0 1 1 2 2 2] - auto* Out = ctx.Output("Out"); - Out->mutable_data(ctx.GetPlace()); - - auto spatial_scale = ctx.Attr("spatial_scale"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto sample_num = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - auto roi_end_mode = 0; - PADDLE_ENFORCE_EQ( - aligned, - false, - platform::errors::InvalidArgument( - "ROIAlignNPU only support Aligned attribute equaled to False")); - - framework::NPUAttributeMap attr_roi = {{"spatial_scale", spatial_scale}, - {"pooled_height", pooled_height}, - {"pooled_width", pooled_width}, - {"sample_num", sample_num}, - {"roi_end_mode", roi_end_mode}}; - - auto stream = - ctx.template device_context() - .stream(); - - // Combine *ROIsNum with ROIs to get new ROIs - // change roisnum's datatype & resize - int dtype = - static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); - framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}}; - phi::DenseTensor ROIsNum_fp(ROIs->dtype()); - ROIsNum_fp.Resize(phi::make_ddim({ROIs->dims()[0], 1})); - ROIsNum_fp.mutable_data(ctx.GetPlace()); - - const auto& runner_c = - NpuOpRunner("Cast", {*ROIsNum}, {ROIsNum_fp}, attr_cast); - runner_c.Run(stream); - - // concate to make (N, 5) - std::vector x_list; - x_list.push_back(ROIsNum_fp); - x_list.push_back(*ROIs); - auto axis = 1; - // output of concate - phi::DenseTensor ROIs_N5(ROIs->dtype()); - ROIs_N5.Resize(phi::make_ddim({ROIs->dims()[0], 5})); - ROIs_N5.mutable_data(ctx.GetPlace()); - - // attribute of concate - auto EleNum = 2; - framework::NPUAttributeMap attr_concat = {{"N", EleNum}, - {"concat_dim", axis}}; - - NpuOpRunner runner0; - runner0.SetType("ConcatD") - .AddInputs(x_list) - .AddOutput(ROIs_N5) - .AddInputNames({"x0", "x1"}) - .AddAttrs(attr_concat); - runner0.Run(stream); - - const auto& runner = - NpuOpRunner("ROIAlign", {*X, ROIs_N5}, {*Out}, attr_roi); - runner.Run(stream); - } -}; - -template -class ROIAlignNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sample_num = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int rois_num = 
rois->dims()[0]; - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - if (!in_grad) { - return; - } - in_grad->mutable_data(place); - - PADDLE_ENFORCE_EQ( - aligned, - false, - platform::errors::InvalidArgument( - "ROIAlignGradNPU only support Aligned attribute equaled to False")); - PADDLE_ENFORCE_EQ( - ctx.HasInput("RoisNum"), - true, - platform::errors::NotFound("Input(RoisNum) of ROIAlignGradOp " - "is not found while using NPU.")); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rois->dtype()), - framework::proto::VarType::FP32, - platform::errors::InvalidArgument( - "ROIAlignGradNPU only support ROIs type equaled to FP32.")); - - // Cast RoisNum to fp32 tensor - auto* RoisNum = ctx.Input("RoisNum"); - phi::DenseTensor ROIs_N5; - ROIs_N5.mutable_data({rois_num, 5}, place); - phi::DenseTensor ROIsNum_fp; - ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] - int nputype_fp32 = - static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); - const auto& runner_cast = NpuOpRunner( - "Cast", {*RoisNum}, {ROIsNum_fp}, {{"dst_type", nputype_fp32}}); - runner_cast.Run(stream); - ROIsNum_fp.Resize({rois_num, 1}); - - // Combine *ROIsNum with ROIs to get new ROIs - std::vector x_list; - x_list.push_back(ROIsNum_fp); - x_list.push_back(*rois); - const auto& runner_concat = NpuOpRunner( - "ConcatD", {x_list}, {ROIs_N5}, {{"N", 2}, {"concat_dim", 1}}); - runner_concat.Run(stream); - - // If CANN version code is less than 504, by analysis, in order to match - // cpu grad version, rois[:,3:5] should substrate 1 before call ascend grad - // function -#if (CANN_VERSION_CODE < 504000) - std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; - phi::DenseTensor tsr_dlt; - tsr_dlt.mutable_data({5}, place); - framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); - ctx.template device_context().Wait(); - const auto& runner_add = - NpuOpRunner("AddV2", {ROIs_N5, tsr_dlt}, {ROIs_N5}, {}); - runner_add.Run(stream); -#endif - - // Call ascend RoiAlignGrad function - int roi_end_mode = 0; - const auto& runner_roi_align_grad = - NpuOpRunner("ROIAlignGrad", - {*out_grad, ROIs_N5}, - {*in_grad}, - {{"xdiff_shape", phi::vectorize(in_dims)}, - {"pooled_width", pooled_width}, - {"pooled_height", pooled_height}, - {"spatial_scale", spatial_scale}, - {"sample_num", sample_num}, - {"roi_end_mode", roi_end_mode}}); - runner_roi_align_grad.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - roi_align, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel); - -REGISTER_OP_NPU_KERNEL(roi_align_grad, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel); diff --git a/paddle/fluid/operators/run_program_op_npu.cc b/paddle/fluid/operators/run_program_op_npu.cc deleted file mode 100644 index e45ce0a2bef9f..0000000000000 --- a/paddle/fluid/operators/run_program_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
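Ascend's ROIAlign takes rois as (N, 5) rows with a leading batch index, which is why both the forward and grad kernels above cast RoisNum (per-ROI batch ids, e.g. [0 1 1 2 2 2]) to fp32 and ConcatD it in front of the (N, 4) boxes. A host-side sketch of the layout being built (PrependBatchIndex is a hypothetical helper for clarity):

#include <array>
#include <vector>

// [batch_idx, x1, y1, x2, y2] per row, batch index prepended as float.
std::vector<std::array<float, 5>> PrependBatchIndex(
    const std::vector<std::array<float, 4>>& rois,
    const std::vector<int>& batch_ids) {
  std::vector<std::array<float, 5>> rois_n5(rois.size());
  for (size_t i = 0; i < rois.size(); ++i) {
    rois_n5[i] = {static_cast<float>(batch_ids[i]),
                  rois[i][0], rois[i][1], rois[i][2], rois[i][3]};
  }
  return rois_n5;
}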
-See the License for the specific language governing permissions and
-limitations under the License. */
diff --git a/paddle/fluid/operators/sampling_id_op_npu.cc b/paddle/fluid/operators/sampling_id_op_npu.cc
deleted file mode 100644
index 5657edcfa35bb..0000000000000
--- a/paddle/fluid/operators/sampling_id_op_npu.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sampling_id_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(sampling_id,
-                       paddle::operators::SamplingIdKernel<float>,
-                       paddle::operators::SamplingIdKernel<double>);
diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc
deleted file mode 100644
index 1fb136a5110db..0000000000000
--- a/paddle/fluid/operators/save_combine_op_npu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/save_combine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    save_combine,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc
deleted file mode 100644
index d6063d66f1531..0000000000000
--- a/paddle/fluid/operators/save_op_npu.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/operators/save_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc deleted file mode 100644 index c25a49c4f3b60..0000000000000 --- a/paddle/fluid/operators/scale_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -static inline T GetAttrFromTensor(const phi::DenseTensor* tensor) { - const auto* tensor_data = tensor->data(); - phi::DenseTensor cpu_tensor; - if (platform::is_gpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - paddle::framework::TensorCopySync( - *tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -template -class ScaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto scale = ctx.Attr("scale"); - auto bias = ctx.Attr("bias"); - auto bias_after_scale = ctx.Attr("bias_after_scale"); - auto stream = - ctx.template device_context() - .stream(); - float power = 1.0; - VLOG(4) << "scale:" << scale << ", bias:" << bias - << " ,bias_after_scale:" << bias_after_scale; - if (ctx.HasInput("ScaleTensor")) { - auto* scale_tensor = ctx.Input("ScaleTensor"); - scale = static_cast(GetAttrFromTensor(scale_tensor)); - } - if (isinf(scale)) { - if (signbit(scale)) { - scale = -std::numeric_limits::max(); - } else { - scale = std::numeric_limits::max(); - } - } - if (!bias_after_scale) { - bias *= scale; - } - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attrs = { - {"power", power}, {"scale", scale}, {"shift", bias}}; - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& muls_runner = NpuOpRunner( - "Muls", {inputs[0]}, {outputs[0]}, {{"value", attrs.at("scale")}}); - muls_runner.Run(dev_ctx.stream()); - - const auto& adds_runner = NpuOpRunner( - "Adds", {outputs[0]}, {outputs[0]}, {{"value", attrs.at("shift")}}); - adds_runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - 
op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("Power", {*x}, {*out}, attrs); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - scale, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc deleted file mode 100644 index b2b09faaa9d44..0000000000000 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/seed_op_npu.cc b/paddle/fluid/operators/seed_op_npu.cc deleted file mode 100644 index 1843e993d552a..0000000000000 --- a/paddle/fluid/operators/seed_op_npu.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/seed_op.h" - -namespace paddle { -namespace operators { - -template -class NPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Output("Out"); - int user_seed = ctx.Attr("seed"); - std::random_device rnd; - int seed; - - if (user_seed != 0) { - seed = user_seed; - } else { - seed = rnd(); - } - - out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(out, seed); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - seed, ops::NPUSeedKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc deleted file mode 100644 index 3978923d46af7..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
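The scale kernel removed above lowers to Ascend's Power op with power fixed at 1, i.e. out = scale * x + shift; when bias_after_scale is false it first folds the bias into the scale, which is what the kernel's `bias *= scale` line implements. The arithmetic, as a one-function reference (ScaleRef is a hypothetical name):

// bias_after_scale == true :  out = scale * x + bias
// bias_after_scale == false:  out = scale * (x + bias)
//                                 = scale * x + (scale * bias)
float ScaleRef(float x, float scale, float bias, bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;
  return scale * x + bias;  // Power(power = 1, scale, shift = bias)
}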
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" - -namespace paddle { -namespace operators { - -template -class SequenceMaskNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - int maxlen = ctx.Attr("maxlen"); - - if (ctx.HasInput("MaxLenTensor")) { - auto max_len_tensor = ctx.Input("MaxLenTensor"); - PADDLE_ENFORCE_NOT_NULL(max_len_tensor, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) should not be NULL." - "But received Input(MaxLenTensor) is NULL")); - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *max_len_tensor, platform::CPUPlace(), &temp); - maxlen = *temp.data(); - PADDLE_ENFORCE_GT( - maxlen, - 0, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) value should be greater than 0. But " - "received Input(MaxLenTensor) value = %d.", - maxlen)); - } - - if (maxlen < 0) { - auto x_numel = x->numel(); - if (x_numel == 0) { - maxlen = 0; - } else { - std::vector x_vec; - framework::TensorToVector(*x, dev_ctx, &x_vec); - auto x_data = x_vec.data(); - maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); - } - } - auto y_dim = phi::vectorize(x->dims()); - y_dim.push_back(maxlen); - - phi::DenseTensor cast_x; - cast_x.mutable_data(x->dims(), ctx.GetPlace()); - const auto& cast1_runner = NpuOpRunner( - "Cast", - {*x}, - {cast_x}, - {{"dst_type", - ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}}); - cast1_runner.Run(dev_ctx.stream()); - - phi::DenseTensor tmp; - tmp.mutable_data(phi::make_ddim({maxlen}), ctx.GetPlace()); - NpuOpRunner range_runner; - range_runner.SetType("Range"); - range_runner.AddInput(std::vector({0})); - range_runner.AddInput(std::vector({maxlen})); - range_runner.AddInput(std::vector({1})); - range_runner.AddOutput(tmp); - range_runner.Run(dev_ctx.stream()); - - phi::DenseTensor expand_tmp; - expand_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& expand_runner = - NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}}); - expand_runner.Run(dev_ctx.stream()); - - auto x_dims = phi::vectorize(x->dims()); - x_dims.push_back(1); - cast_x.Resize(phi::make_ddim({x_dims})); - phi::DenseTensor x_tmp; - x_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& tile_runner = - NpuOpRunner("TileWithAxis", - {cast_x}, - {x_tmp}, - {{"axis", x->dims().size()}, {"tiles", maxlen}}); - tile_runner.Run(dev_ctx.stream()); - - phi::DenseTensor y_tmp; - y_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& less_runner = - NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {}); - less_runner.Run(dev_ctx.stream()); - - y->Resize(phi::make_ddim(y_dim)); - auto out_dtype = static_cast( - ctx.Attr("out_dtype")); - if (out_dtype == framework::proto::VarType::INT32) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::INT64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP32) { - 
y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::BOOL) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::UINT8) { - y->mutable_data(ctx.GetPlace()); - } else { - PADDLE_ENFORCE(false, - platform::errors::InvalidArgument( - "out_dtype only supporing int32, int64, fp32, fp64, " - "bool, uint8, but receive out_dtype is %d", - out_dtype)); - } - - const auto& cast2_runner = NpuOpRunner( - "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}}); - cast2_runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - sequence_mask, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc deleted file mode 100644 index b572e98eb81e9..0000000000000 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
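The sequence_mask kernel above composes Range, ExpandD, TileWithAxis and Less to compute y[i][j] = (j < x[i]) for j in [0, maxlen), falling back to maxlen = max(x) when the attribute is negative. Reference semantics as a sketch (SequenceMaskRef is a hypothetical name):

#include <algorithm>
#include <vector>

// y[i][j] = 1 iff j < lengths[i]; maxlen < 0 means "use the longest sequence".
std::vector<std::vector<int>> SequenceMaskRef(const std::vector<int>& lengths,
                                              int maxlen) {
  if (maxlen < 0)
    maxlen = lengths.empty()
                 ? 0
                 : *std::max_element(lengths.begin(), lengths.end());
  std::vector<std::vector<int>> y(lengths.size(),
                                  std::vector<int>(maxlen, 0));
  for (size_t i = 0; i < lengths.size(); ++i)
    for (int j = 0; j < maxlen && j < lengths[i]; ++j) y[i][j] = 1;
  return y;
}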
*/ - -#include "paddle/fluid/operators/set_value_op.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class SetValueNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = - phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = - phi::funcs::GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - paddle::framework::TensorCopy(*in, ctx.GetPlace(), out); - - auto starts_indices = std::vector(in_dims.size(), 0); - auto ends_indices = std::vector(in_dims.size(), 0); - auto strides_indices = std::vector(in_dims.size(), 0); - - for (int i = 0; i < in_dims.size(); ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - } - - int64_t stride_step = phi::product(in_dims); - std::vector index_indices(1, 0); - for (size_t i = 0; i < strides_indices.size(); ++i) { - auto index_size = index_indices.size(); - stride_step /= in_dims[i]; - for (size_t j = 0; j < index_size; ++j) { - auto start_index = *index_indices.begin(); - if (strides_indices[i] > 0) { - for (int64_t k = starts_indices[i]; k < ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } else { - for (int64_t k = starts_indices[i]; k > ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } - index_indices.erase(index_indices.begin()); - } - } - - PADDLE_ENFORCE_EQ( - static_cast(index_indices.size()), - 
phi::product(slice_dims_for_assign), - platform::errors::InvalidArgument( - "OP(set_value) error: index indices and the value to update do not match.")); - - phi::DenseTensor value_t(in->type()); - if (value_tensor != nullptr) { - value_t.ShareDataWith(*value_tensor); - } else { - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, ctx.GetPlace()); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVectorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - } - - auto stream = ctx.template device_context().stream(); - - phi::DenseTensor value_temp(in->type()); - if (slice_dims_for_assign == value_t.dims()) { - value_temp.ShareDataWith(value_t); - } else { - value_temp.Resize(slice_dims_for_assign); - value_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(value_t) - .AddInput(phi::vectorize(slice_dims_for_assign)) - .AddOutput(value_temp) - .Run(stream); - } - - int64_t input_numel = phi::product(in_dims); - int64_t index_numel = index_indices.size(); - - phi::DenseTensor in_temp, out_temp, val_temp; - in_temp.ShareDataWith(*in); - out_temp.ShareDataWith(*out); - val_temp.ShareDataWith(value_temp); - in_temp.Resize(phi::make_ddim({input_numel})); - out_temp.Resize(phi::make_ddim({input_numel})); - val_temp.Resize(phi::make_ddim({index_numel})); - - NpuOpRunner runner; - runner.SetType("ScatterUpdate") - .AddInput(in_temp) - .AddInput(std::move(index_indices)) - .AddInput(val_temp) - .AddOutput(out_temp) -#if (CANN_VERSION_CODE >= 504000) - .AddAttrs({{"use_locking", false}}) -#endif - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(set_value, - ops::SetValueNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SetValueNPUKernel, -#endif - ops::SetValueNPUKernel) diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc deleted file mode 100644 index 76f4539e70b2f..0000000000000 --- a/paddle/fluid/operators/shape_op_npu.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShapeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - out_t->Resize({x->dims().size()}); - out_t->mutable_data(ctx.GetPlace()); - - // The output data type defaults to int32.
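(For reference: the ShapeNPUKernel being deleted here always emits the input's dims as int32, regardless of the input dtype; see the "dtype" attr passed to the Shape runner just below. A minimal host-side sketch of that contract follows; the helper name is hypothetical, not a Paddle API.)

#include <cstdint>
#include <vector>

// What the NPU Shape op computes here: the input's dims as a 1-D int32
// tensor. Any dim >= 2^31 would silently narrow.
std::vector<int32_t> ShapeOpReference(const std::vector<int64_t>& dims) {
  std::vector<int32_t> out(dims.size());
  for (size_t i = 0; i < dims.size(); ++i) {
    out[i] = static_cast<int32_t>(dims[i]);
  }
  return out;
}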
- auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - runner.SetType("Shape").AddInput(*x).AddOutput(*out_t).AddAttr( - "dtype", static_cast(dst_dtype)); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - shape, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel); diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc deleted file mode 100644 index 4181db1d8e04c..0000000000000 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShardIndexNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "start kernel"; - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - - PADDLE_ENFORCE_GT( - index_num, - 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, - 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, - 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, - nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, - shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - auto place = context.GetPlace(); - out->Resize(in->dims()); - out->set_lod(in->lod()); - out->mutable_data(place); - - phi::DenseTensor tmp(in->type()); - tmp.mutable_data(framework::DDim({1}), place); - FillNpuTensorWithConstant(&tmp, shard_size); - - phi::DenseTensor condition(phi::DataType::BOOL); - condition.mutable_data(in->dims(), place); - - phi::DenseTensor tmp2(in->type()); - tmp2.mutable_data(in->dims(), place); - - phi::DenseTensor tmp3(in->type()); - tmp3.mutable_data(in->dims(), place); - - auto stream = - context.template device_context() - .stream(); - - NpuOpRunner runner; - runner.AddInputs({*in, tmp}); - runner.AddOutputs({tmp2}); - runner.SetType("Mod"); - 
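(The Mod runner prepared above, together with the FloorDiv, Equal, and Select runners that follow below, computes shard_index element-wise. A scalar sketch of the composed formula, assuming shard_size = (index_num + nshards - 1) / nshards as computed earlier in the kernel; the helper name is hypothetical.)

#include <cstdint>

// An index v belongs to shard v / shard_size. If that shard is this
// worker's shard_id, emit the local offset v % shard_size; otherwise
// emit ignore_value.
int64_t ShardIndexReference(int64_t v, int64_t shard_size,
                            int64_t shard_id, int64_t ignore_value) {
  return (v / shard_size == shard_id) ? (v % shard_size) : ignore_value;
}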
runner.Run(stream); - - NpuOpRunner runner1; - runner1.AddInputs({*in, tmp}); - runner1.AddOutputs({tmp3}); - runner1.SetType("FloorDiv"); - runner1.Run(stream); - - FillNpuTensorWithConstant(&tmp, shard_id); - NpuOpRunner runner2; - runner2.AddInputs({tmp3, tmp}); - runner2.AddOutputs({condition}); - runner2.SetType("Equal"); - runner2.Run(stream); - - phi::DenseTensor tmp4(in->type()); - tmp4.mutable_data(in->dims(), place); - FillNpuTensorWithConstant(&tmp4, ignore_value); - tmp4.Resize(in->dims()); - - NpuOpRunner runner3; - runner3.AddInputs({condition, tmp2, tmp4}); - runner3.AddOutputs({*out}); - runner3.SetType("Select"); - runner3.Run(stream); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(shard_index, - ops::ShardIndexNPUKernel, - ops::ShardIndexNPUKernel); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc deleted file mode 100644 index 0d4ad6331e807..0000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -const int kIgnoreIndex = -100; - -void CheckAttrs(const framework::ExecutionContext& ctx) { - // Add this check is due to Ascend SigmoidCrossEntropyWithLogits - // and SigmoidCrossEntropyWithLogitsGrad does't supoort - // attr normalize and ignore_index - bool normalize = ctx.Attr("normalize"); - int ignore_index = ctx.Attr("ignore_index"); - PADDLE_ENFORCE_EQ(normalize, - false, - platform::errors::InvalidArgument( - "attr normalize must be false, but got true")); - PADDLE_ENFORCE_EQ(ignore_index, - kIgnoreIndex, - platform::errors::InvalidArgument( - "attr ignore_index must be default %d, but got %d", - kIgnoreIndex, - ignore_index)); -} - -template -class SigmoidCrossEntropyWithLogitsNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - CheckAttrs(ctx); - - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("SigmoidCrossEntropyWithLogits", {*x, *label}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidCrossEntropyWithLogitsNPUGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - CheckAttrs(ctx); - - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - 
.stream(); - - const auto& runner_dx = NpuOpRunner( - "SigmoidCrossEntropyWithLogitsGrad", {*x, *label, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsNPUKernel, - ops::SigmoidCrossEntropyWithLogitsNPUKernel); -REGISTER_OP_NPU_KERNEL( - sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsNPUGradKernel, - ops::SigmoidCrossEntropyWithLogitsNPUGradKernel); diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc deleted file mode 100644 index 594b0cc18e886..0000000000000 --- a/paddle/fluid/operators/size_op_npu.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SizeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - Tensor cpu_tensor; - auto cpu_data = - cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); - cpu_data[0] = x->numel(); - paddle::framework::TensorCopy( - cpu_tensor, - ctx.GetPlace(), - ctx.template device_context(), - out); - ctx.template device_context().Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - size, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel); diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc deleted file mode 100644 index a54ba630b274c..0000000000000 --- a/paddle/fluid/operators/slice_op_npu.cc +++ /dev/null @@ -1,254 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -void UpdateAttr(const framework::DDim& in_dims, - const std::vector axes, - const std::vector starts, - const std::vector ends, - std::vector* offsets, - std::vector* size) { - int cnt = 0; - for (int i = 0; i < in_dims.size(); ++i) { - int start = 0; - int end = in_dims[i]; - // NOTE(zhiqiu): Be careful that cnt may exceed axes.size() and result in - // overflow. - int axis = cnt < static_cast(axes.size()) ? axes[cnt] : -1; - if (axis == i) { - start = starts[cnt]; - if (start < 0) { - start = (start + in_dims[i]); - } - start = std::max(start, static_cast(0)); - end = ends[cnt]; - if (end < 0) { - end = (end + in_dims[i]); - } - end = std::min(end, static_cast(in_dims[i])); - cnt++; - } - - (*offsets)[i] = start; - (*size)[i] = end - start; - } -} - -template -class SliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto infer_flags = ctx.Attr>("infer_flags"); - - const auto& in_dims = input->dims(); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - PADDLE_ENFORCE_EQ( - starts.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - - if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || - starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { - // Infer output dims - auto out_dims = out->dims(); - auto slice_dims = out_dims; - for (size_t i = 0; i < axes.size(); ++i) { - // when start == -1 && end == start+1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = in_dims[axes[i]]; - } - } - } - - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); - slice_dims = phi::funcs::GetSliceDims( - in_dims, axes, starts, ends, nullptr, nullptr); - out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); - - out->Resize(out_dims); - } - - out->mutable_data(ctx.GetPlace()); - - std::vector offsets(in_dims.size()); - std::vector size(in_dims.size()); - - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size);
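(UpdateAttr, defined above and invoked here, normalizes each sliced axis's [start, end) pair into a non-negative offset and size; axes not listed keep their full extent. A standalone restatement under assumed container types, since the original template arguments were elided in this diff.)

#include <algorithm>
#include <cstdint>
#include <vector>

// Negative starts/ends wrap around the dim, both are clamped to [0, dim],
// and unsliced axes get offset 0 with the full dim as size.
void SliceOffsetsAndSizes(const std::vector<int64_t>& in_dims,
                          const std::vector<int>& axes,
                          const std::vector<int64_t>& starts,
                          const std::vector<int64_t>& ends,
                          std::vector<int64_t>* offsets,
                          std::vector<int64_t>* sizes) {
  offsets->assign(in_dims.size(), 0);
  sizes->assign(in_dims.size(), 0);
  size_t cnt = 0;
  for (size_t i = 0; i < in_dims.size(); ++i) {
    int64_t start = 0;
    int64_t end = in_dims[i];
    if (cnt < axes.size() && axes[cnt] == static_cast<int>(i)) {
      start = starts[cnt] < 0 ? starts[cnt] + in_dims[i] : starts[cnt];
      start = std::max<int64_t>(start, 0);
      end = ends[cnt] < 0 ? ends[cnt] + in_dims[i] : ends[cnt];
      end = std::min<int64_t>(end, in_dims[i]);
      ++cnt;
    }
    (*offsets)[i] = start;
    (*sizes)[i] = end - start;
  }
}

(For example, in_dims = {4, 5}, axes = {1}, starts = {-3}, ends = {100} yields offsets = {0, 2} and sizes = {4, 3}.)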
- - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); -#if CANN_VERSION_CODE < 512000 - const auto& runner = - NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, { - "size", - size - }}); -#else - NpuOpRunner runner; - runner.SetType("Slice") - .AddInput(*input) - .AddInput(std::move(offsets)) - .AddInput(std::move(size)) - .AddOutput(*out); -#endif - runner.Run(stream); - } -}; - -template -class SliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dinput = - ctx.Output(framework::GradVarName("Input")); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - const auto& in_dims = input->dims(); - int rank = in_dims.size(); - - std::vector offsets(rank); - std::vector size(rank); - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - - std::vector> paddings(rank, std::vector(2)); - for (int i = 0; i < rank; ++i) { - paddings[i][0] = static_cast(offsets[i]); - paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); - } - - phi::DenseTensor tmp_dout; - tmp_dout.ShareDataWith(*dout); - auto out_dims = dout->dims(); - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto decrease_size = decrease_axis.size(); - if (decrease_size > 0) { - if (decrease_size == static_cast(in_dims.size())) { - out_dims = phi::make_ddim(std::vector(decrease_size, 1)); - } else { - std::vector origin_out_shape(out_dims.size() + decrease_size, -1); - for (size_t i = 0; i < decrease_size; ++i) { - origin_out_shape[decrease_axis[i]] = 1; - } - int index = 0; - for (size_t i = 0; i < origin_out_shape.size(); ++i) { - if (origin_out_shape[i] == -1) { - origin_out_shape[i] = out_dims[index]; - ++index; - } - } - out_dims = phi::make_ddim(origin_out_shape); - } - tmp_dout.Resize(out_dims); - } - - dinput->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(slice, - ops::SliceNPUKernel, - ops::SliceNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SliceNPUKernel, -#endif - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL(slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc deleted file mode 100644 index abb6353ca0d1d..0000000000000 --- 
a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/smooth_l1_loss_op.h" - -namespace paddle { -namespace operators { - -template -class SmoothL1LossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* out_diff = context.Output("Diff"); - auto* out_loss = context.Output("Out"); - out_diff->mutable_data(context.GetPlace()); - out_loss->mutable_data(context.GetPlace()); - - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - // out_diff = in_x - in_y - auto stream = - context.template device_context() - .stream(); - const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); - runner1.Run(stream); - - phi::DenseTensor no_reduce_loss(in_x->dtype()); - no_reduce_loss.Resize(in_x->dims()); - no_reduce_loss.mutable_data(context.GetPlace()); - // multiply inside weight before get the loss - if (has_weight) { - phi::DenseTensor tmp_diff(out_diff->dtype()); - tmp_diff.Resize(out_diff->dims()); - tmp_diff.mutable_data(context.GetPlace()); - const auto& runner2 = - NpuOpRunner("Mul", {*out_diff, *inside_weight}, {tmp_diff}, {}); - runner2.Run(stream); - framework::TensorCopy( - tmp_diff, - context.GetPlace(), - context.template device_context(), - out_diff); - - phi::DenseTensor tmp_x(in_x->dtype()); - tmp_x.Resize(in_x->dims()); - tmp_x.mutable_data(context.GetPlace()); - - phi::DenseTensor tmp_y(in_y->dtype()); - tmp_y.Resize(in_y->dims()); - tmp_y.mutable_data(context.GetPlace()); - - // mul input and inside_weight - const auto& runner_x = - NpuOpRunner("Mul", {*in_x, *inside_weight}, {tmp_x}, {}); - runner_x.Run(stream); - const auto& runner_y = - NpuOpRunner("Mul", {*in_y, *inside_weight}, {tmp_y}, {}); - runner_y.Run(stream); - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {tmp_x, tmp_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } else { - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {*in_x, *in_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } - - // multiply outside weight and loss - // reduceSum because the output'shape must be [B,1] - if (has_weight) { - phi::DenseTensor tmp_loss(no_reduce_loss.dtype()); - tmp_loss.Resize(no_reduce_loss.dims()); - tmp_loss.mutable_data(context.GetPlace()); - const auto& runner4 = - NpuOpRunner("Mul", {no_reduce_loss, *outside_weight}, {tmp_loss}, {}); - runner4.Run(stream); - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {tmp_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - 
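(The ReduceSumD constructed here and run just below collapses the element-wise loss of shape [B, D] into per-sample sums of shape [B, 1], via axes = {1} and keep_dims = true. A host-side sketch of that reduction, assuming row-major layout; the function name is hypothetical.)

#include <cstddef>
#include <vector>

// Sum each length-D row of a row-major [B, D] buffer; the result is
// logically [B, 1], matching ReduceSumD(axes={1}, keep_dims=true).
std::vector<float> ReduceRowsToColumn(const std::vector<float>& loss,
                                      std::size_t B, std::size_t D) {
  std::vector<float> out(B, 0.0f);
  for (std::size_t b = 0; b < B; ++b) {
    for (std::size_t d = 0; d < D; ++d) {
      out[b] += loss[b * D + d];
    }
  }
  return out;
}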
runner5.Run(stream); - } else { - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {no_reduce_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - runner5.Run(stream); - } - } -}; - -template -class SmoothL1LossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* diff = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); - auto* outx_grad = - context.Output(framework::GradVarName("X")); - auto* outy_grad = - context.Output(framework::GradVarName("Y")); - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - - auto stream = - context.template device_context() - .stream(); - - // diff == in_x - in_y == diff - 0 - phi::DenseTensor tmp_zero(diff->dtype()); - tmp_zero.Resize(diff->dims()); - tmp_zero.mutable_data(context.GetPlace()); - const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); - runner_zero.Run(stream); - - phi::DenseTensor grad(diff->dtype()); - grad.Resize(diff->dims()); - grad.mutable_data(context.GetPlace()); - // broadcast og(output_grad) to adapt to the npu interface - const auto& runner_broad = - NpuOpRunner("BroadcastToD", - {*og}, - {grad}, - {{"shape", phi::vectorize(diff->dims())}}); - runner_broad.Run(stream); - - phi::DenseTensor gradient(diff->dtype()); - gradient.Resize(diff->dims()); - gradient.mutable_data(context.GetPlace()); - // diff == diff - 0 == in_x - in_y - const auto& runner_grad = NpuOpRunner("SmoothL1LossGrad", - {*diff, tmp_zero, grad}, - {gradient}, - {{"sigma", sigma2}}); - runner_grad.Run(stream); - - // mul weight and gradient - if (has_weight) { - phi::DenseTensor weight(inside_weight->dtype()); - weight.Resize(inside_weight->dims()); - weight.mutable_data(context.GetPlace()); - const auto& runner_weight = - NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); - runner_weight.Run(stream); - - phi::DenseTensor tmp_grad(gradient.dtype()); - tmp_grad.Resize(gradient.dims()); - tmp_grad.mutable_data(context.GetPlace()); - const auto& runner_weight_grad = - NpuOpRunner("Mul", {gradient, weight}, {tmp_grad}, {}); - runner_weight_grad.Run(stream); - - framework::TensorCopy( - tmp_grad, - context.GetPlace(), - context.template device_context(), - &gradient); - } - // outx_grad = gradient - if (outx_grad) { - outx_grad->mutable_data(context.GetPlace()); - framework::TensorCopy( - gradient, - context.GetPlace(), - context.template device_context(), - outx_grad); - } - - // outy_grad = - gradient - if (outy_grad) { - outy_grad->mutable_data(context.GetPlace()); - phi::DenseTensor coeff(phi::DataType::FLOAT32); - coeff.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&coeff, -1); - const auto& runner_y_grad = - NpuOpRunner("Mul", {coeff, gradient}, {*outy_grad}, {}); - runner_y_grad.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc deleted file mode 100644 index de7df0de5b3d5..0000000000000 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ 
/dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto axis = ctx.Attr("axis"); - std::vector axes; - axes.push_back(axis); - framework::NPUAttributeMap attr_input = {{"axes", axes}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class SoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - - auto* dX = ctx.Output(framework::GradVarName("X")); - - auto dims = dX->dims(); - const int rank = dims.size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - int64_t first_dim = 1; - int64_t sec_dim = 1; - for (int i = 0; i < axis; i++) { - first_dim *= dims[i]; - } - for (int i = axis; i < rank; i++) { - sec_dim *= dims[i]; - } - - Tensor tmp_out; - tmp_out.ShareDataWith(*out).Resize({first_dim, sec_dim}); - - Tensor tmp_dOut; - tmp_dOut.ShareDataWith(*dOut).Resize({first_dim, sec_dim}); - - dX->Resize(phi::make_ddim({first_dim, sec_dim})); - dX->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {}; - const auto& runner = NpuOpRunner( - std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, {*dX}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - dX->Resize(dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - softmax, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL( - softmax_grad, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc deleted file mode 100644 index dd1462b1c07cc..0000000000000 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - for (int i = 3; i < 9; ++i) { - init.push_back(static_cast(i)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({2, 3}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({2, 3}); - tensor_out->mutable_data(place); // allocate - - // run - int axis = 1; - f::AttributeMap attrs = { - {"axis", axis}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - }; - - auto op = f::OpRegistry::CreateOp( - "softmax", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector out_init; - - out_init.push_back(static_cast(0.6670)); - out_init.push_back(static_cast(0.5888)); - out_init.push_back(static_cast(0.4543)); - out_init.push_back(static_cast(0.3330)); - out_init.push_back(static_cast(0.4112)); - out_init.push_back(static_cast(0.5457)); - - paddle::framework::TensorFromVector(out_init, ctx, tensor_out); - tensor_out->Resize({2, 3}); - - ctx.Wait(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector dout_init; - for (int i = 0; i < 6; ++i) { - dout_init.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(dout_init, ctx, tensor_dout); - tensor_dout->Resize({2, 3}); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs = { - {"name", std::string("softmax_grad")}, - {"axis", static_cast(0)}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - {"data_format", std::string("AnyLayout")}, - }; - auto op = f::OpRegistry::CreateOp("softmax_grad", - {{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_dx->dims()[0], 
(uint32_t)(2)); - EXPECT_EQ((uint32_t)tensor_dx->dims()[1], (uint32_t)(3)); - - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); - EXPECT_NEAR((float)out_vec[0], (float)(-0.4737), 0.1); - EXPECT_NEAR((float)out_vec[1], (float)(-0.4181), 0.1); - EXPECT_NEAR((float)out_vec[2], (float)(-0.3226), 0.1); - EXPECT_NEAR((float)out_vec[3], (float)(-0.0965), 0.1); - EXPECT_NEAR((float)out_vec[4], (float)(-0.1192), 0.1); - EXPECT_NEAR((float)out_vec[5], (float)(-0.1582), 0.1); -} - -TEST(softmax, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(softmax_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc deleted file mode 100644 index af0e9d55445d5..0000000000000 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" -#include "paddle/phi/kernels/funcs/cross_entropy.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* logits = ctx.Input("Logits"); - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Output("Softmax"); - auto* loss = ctx.Output("Loss"); - auto* backprop = ctx.Output("Backprop"); - auto soft_label = ctx.Attr("soft_label"); - PADDLE_ENFORCE_EQ(soft_label, - false, - platform::errors::Unimplemented( - "soft_label=True is not supported in " - "the npu kernel of softmax_with_cross_entropy.")); - - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); - - PADDLE_ENFORCE_EQ( - labels->numel(), - n, - platform::errors::Unimplemented( - "The size of labels should be equal to phi::funcs::SizeToAxis of " - "logits," - "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", - labels->numel(), - n)); - - loss->mutable_data(ctx.GetPlace()); - backprop->mutable_data(ctx.GetPlace()); - softmax->mutable_data(ctx.GetPlace()); - - phi::DenseTensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; - logits_2d.ShareDataWith(*logits).Resize({n, d}); - labels_1d.ShareDataWith(*labels).Resize({n}); - loss_1d.ShareDataWith(*loss).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - for (auto i = axis; i < logits->dims().size(); ++i) { - axes.push_back(i); - } - const auto& runner_softmax = - NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); - runner_softmax.Run(stream); - - // SparseSoftmaxCrossEntropyWithLogits - const auto& runner_s = NpuOpRunner("SparseSoftmaxCrossEntropyWithLogits", - {logits_2d, labels_1d}, - {loss_1d, backprop_2d}, - {}); - runner_s.Run(stream); - } -}; - -template -class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* backprop = ctx.Input("Backprop"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* logits_grad = - ctx.Output(framework::GradVarName("Logits")); - - PADDLE_ENFORCE_NOT_NULL(backprop, - platform::errors::PreconditionNotMet( - "backprop should not be null in NPU kernel of " - "softmax_with_cross_entropy_grad.")); - logits_grad->mutable_data(ctx.GetPlace()); - - const int rank = logits_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); - - phi::DenseTensor logits_grad_2d, loss_grad_1d, backprop_2d; - - logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); - loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_mul = - NpuOpRunner("Mul", {*loss_grad, *backprop}, {*logits_grad}, {}); - runner_mul.Run(stream); - } -}; -} // namespace 
operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyNPUKernel, - ops::SoftmaxWithCrossEntropyNPUKernel); -REGISTER_OP_NPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc deleted file mode 100644 index 763b375d00e9b..0000000000000 --- a/paddle/fluid/operators/split_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/split_op.h" - -namespace paddle { -namespace operators { - -template -class SplitNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - int num = ctx.Attr("num"); - std::vector sections = ctx.Attr>("sections"); - int axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The AxisTensor is not supported on NPU now.")); - } - if (ctx.HasInput("SectionsTensorList")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The SectionsTensorList is not supported on NPU now.")); - } - - std::vector outputs; - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(*outs[j]); - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - if (sections.size() == 0) { - framework::NPUAttributeMap attr_input = {{"num_split", num}, - {"split_dim", axis}}; - runner.SetType("SplitD").AddInputs({*in}).AddOutputs(outputs).AddAttrs( - attr_input); - } else { - framework::NPUAttributeMap attr_input = { - {"size_splits", sections}, - {"split_dim", axis}, - {"num_split", static_cast(sections.size())}}; - runner.SetType("SplitVD").AddInput(*in).AddOutputs(outputs).AddAttrs( - attr_input); - } - - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(split, - ops::SplitNPUKernel, - ops::SplitNPUKernel, - ops::SplitNPUKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc deleted file mode 100644 index fb7d4607fc085..0000000000000 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *out = context.Output("Out"); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - std::vector axis; - for (int i = 0; i < x->dims().size(); ++i) { - axis.push_back(i); - } - out->mutable_data(place); - const auto &runner = NpuOpRunner( - "SquareSumV1", {*x}, {*out}, {{"axis", axis}, {"keep_dims", false}}); - runner.Run(stream); - } -}; - -template -class SquaredL2NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *x_grad = - context.Output(framework::GradVarName("X")); - auto *out_grad = - context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - out_grad->numel(), - 1, - platform::errors::InvalidArgument( - "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - // broadcast out_grad - phi::DenseTensor broadcasted_out_grad; - broadcasted_out_grad.mutable_data(x_grad->dims(), place); - const auto &broadcast_runner = - NpuOpRunner("BroadcastToD", - {*out_grad}, - {broadcasted_out_grad}, - {{"shape", phi::vectorize(x_grad->dims())}}); - broadcast_runner.Run(stream); - // mul x - phi::DenseTensor tmp_x_grad; - tmp_x_grad.mutable_data(x_grad->dims(), place); - const auto &mul_x_runner = - NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); - mul_x_runner.Run(stream); - // mul coefficient:2 - phi::DenseTensor coefficient; - coefficient.mutable_data({1}, place); - FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); - x_grad->mutable_data(place); - const auto &mul_coefficient_runner = - NpuOpRunner("Mul", {tmp_x_grad, coefficient}, {*x_grad}, {}); - mul_coefficient_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squared_l2_norm, - ops::SquaredL2NormNPUKernel); -REGISTER_OP_NPU_KERNEL( - squared_l2_norm_grad, - ops::SquaredL2NormGradNPUKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu.cc b/paddle/fluid/operators/squeeze_op_npu.cc deleted file mode 100644 index 308f092ad740f..0000000000000 --- a/paddle/fluid/operators/squeeze_op_npu.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squeeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel); -REGISTER_OP_NPU_KERNEL( - squeeze2, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel); -REGISTER_OP_NPU_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel); -REGISTER_OP_NPU_KERNEL( - squeeze2_grad, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc deleted file mode 100644 index f0f683e488246..0000000000000 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(squeeze); -USE_OP_DEVICE_KERNEL(squeeze, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - int dim0 = 1; - int dim1 = 10; - int dim2 = 1; - - std::vector init; - for (int64_t i = 0; i < dim0 * dim1 * dim2; ++i) { - init.push_back(static_cast(0.1)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({dim0, dim1, dim2}); - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector axis; - axis.push_back(2); - f::AttributeMap attrs = {{"axes", axis}}; - - auto op = f::OpRegistry::CreateOp( - "squeeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(2)); - EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(dim0)); - EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(0.1)); - } - - ctx.Wait(); -} - -TEST(squeeze, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc deleted file mode 100644 index 8c6447971d9ad..0000000000000 --- a/paddle/fluid/operators/stack_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class StackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -template -class StackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto& runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - stack, - paddle::operators::StackNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackNPUKernel, -#endif - paddle::operators::StackNPUKernel, - paddle::operators::StackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - stack_grad, - paddle::operators::StackGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackGradNPUKernel, -#endif - paddle::operators::StackGradNPUKernel, - paddle::operators::StackGradNPUKernel); diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc deleted file mode 100644 index 4c3bfed5d5d4b..0000000000000 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/strided_slice.h" - -namespace paddle { -namespace operators { - -using Variable = framework::Variable; -using LoDTensorArray = framework::LoDTensorArray; -using DDim = framework::DDim; - -template -class StridedSliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); - switch (rank) { - case 1: - StridedSliceCompute<1>(ctx); - break; - case 2: - StridedSliceCompute<2>(ctx); - break; - case 3: - StridedSliceCompute<3>(ctx); - break; - case 4: - StridedSliceCompute<4>(ctx); - break; - case 5: - StridedSliceCompute<5>(ctx); - break; - case 6: - StridedSliceCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); - auto in_dims = in->dims(); - - // list - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - // vector> - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - // phi::DenseTensor - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - // out dims calculation - std::vector out_dims_vector(in_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - in_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - // check whether need to reverse (false: stride > 0; true: stride < 0) - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - in_dims, - infer_flags, - 
decrease_axis, - starts.size()); - - // construct the starts_indices, ends_indices and strides_indices tensor for - // calling StridedSlice op - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, ctx.device_context(), &starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, ctx.device_context(), &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, ctx.device_context(), &strides_indices_tensor); - - auto out_dims_origin = out_dims; - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], - 1, - platform::errors::InvalidArgument( - "the size of decrease dimension should be 1, but received %d.", - out_dims[decrease_axis[i]])); - out_dims_origin[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims_origin.size(); ++i) { - if (out_dims_origin[i] != 0) { - new_out_shape.push_back(out_dims_origin[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - out_dims_origin = phi::make_ddim(new_out_shape); - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - out->Resize(out_dims); - out->mutable_data(place); - - const auto& runner = NpuOpRunner("StridedSlice", - {*in, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor}, - {*out}, - {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}); - runner.Run(stream); - - if (need_reverse) { - phi::DenseTensor out_tmp; - out_tmp.mutable_data(out_dims, place); - paddle::framework::TensorCopy( - *out, - place, - ctx.template device_context(), - &out_tmp); - - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, ctx.device_context(), &reverse_axis); - - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {out_tmp, reverse_axis}, {*out}); - runner_reverse.Run(stream); - } - - if (decrease_axis.size() > 0) { - out->Resize(out_dims_origin); - } - } -}; - -template -class StridedSliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = 
ctx.Input("Input")->dims().size(); - - switch (rank) { - case 1: - StridedSliceGradCompute<1>(ctx); - break; - case 2: - StridedSliceGradCompute<2>(ctx); - break; - case 3: - StridedSliceGradCompute<3>(ctx); - break; - case 4: - StridedSliceGradCompute<4>(ctx); - break; - case 5: - StridedSliceGradCompute<5>(ctx); - break; - case 6: - StridedSliceGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto& dev_ctx = - ctx.template device_context(); - - auto* input = ctx.Input("Input"); - auto input_dims = input->dims(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); - dx->mutable_data(input_dims, place); - - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - std::vector out_dims_vector(input_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - input_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - input_dims, - infer_flags, - decrease_axis, - starts.size()); - - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, dev_ctx, 
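// Note: StridedSliceGrad scatters dout back into a zero-initialized tensor
// of the original input shape, so the grad pass rebuilds the same
// starts/ends/strides index tensors and additionally passes that shape
// (input_dims_tensor below) as the first input of the Ascend op.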
&starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, dev_ctx, &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, dev_ctx, &strides_indices_tensor); - - std::vector input_dims_vector; - for (int i = 0; i < input_dims.size(); i++) { - input_dims_vector.push_back(input_dims[i]); - } - phi::DenseTensor input_dims_tensor; - paddle::framework::TensorFromVector( - input_dims_vector, dev_ctx, &input_dims_tensor); - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto stream = dev_ctx.stream(); - framework::NPUAttributeMap attr_input = {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}; - - if (need_reverse) { - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, dev_ctx, &reverse_axis); - - phi::DenseTensor dout_tmp; - dout_tmp.mutable_data(dout->dims(), place); - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {*dout, reverse_axis}, {dout_tmp}); - runner_reverse.Run(stream); - - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - dout_tmp}, - {*dx}, - attr_input); - runner.Run(stream); - } else { - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - *dout}, - {*dx}, - attr_input); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - strided_slice, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - strided_slice_grad, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc deleted file mode 100644 index 5d1656b79e9a8..0000000000000 --- a/paddle/fluid/operators/sum_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
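The kernel below maps Paddle's sum over dense tensors to Ascend's AddN. As a reference for that branch, a minimal host-side model of the computation (an illustrative helper, not a Paddle API):

#include <vector>

// out[j] = x0[j] + x1[j] + ... ; all inputs share one length.
std::vector<float> AddNReference(const std::vector<std::vector<float>>& xs) {
  std::vector<float> out(xs.front().size(), 0.0f);
  for (const auto& x : xs)
    for (size_t j = 0; j < x.size(); ++j) out[j] += x[j];
  return out;
}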
*/

-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using SelectedRows = phi::SelectedRows;
-
-template <typename DeviceContext, typename T>
-class SumNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto out_var = ctx.OutputVar("Out");
-    if (out_var->IsType<phi::DenseTensor>()) {
-      auto *out = out_var->GetMutable<phi::DenseTensor>();
-      auto x = ctx.MultiInput<phi::DenseTensor>("X");
-      out->mutable_data<T>(ctx.GetPlace());
-
-      auto place = ctx.GetPlace();
-
-      int n = static_cast<int>(x.size());
-      if (n == 1) {
-        paddle::framework::TensorCopy(*x[0], place, out);
-        return;
-      }
-
-      std::vector<phi::DenseTensor> inputs;
-      std::vector<std::string> names;
-      for (int i = 0; i < n; ++i) {
-        if (x[i] && x[i]->numel() > 0) {
-          inputs.push_back(*x[i]);
-          names.push_back("x" + std::to_string(i));
-        } else {
-          continue;
-        }
-      }
-
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
-      runner.AddInputNames(names);
-      runner.Run(stream);
-    } else if (out_var->IsType<framework::LoDTensorArray>()) {
-      auto in_vars = ctx.MultiInputVar("X");
-      bool in_place = out_var == in_vars[0];
-      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
-      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
-        PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
-                          true,
-                          platform::errors::InvalidArgument(
-                              "Only support all inputs are TensorArray, "
-                              "but inputs[%d] is not TensorArray.",
-                              i));
-        auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
-
-        for (size_t i = 0; i < in_array.size(); ++i) {
-          if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
-            if (i >= out_array.size()) {
-              out_array.resize(i + 1);
-            }
-            if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
-              framework::TensorCopy(in_array[i],
-                                    in_array[i].place(),
-                                    ctx.device_context(),
-                                    &out_array[i]);
-              out_array[i].set_lod(in_array[i].lod());
-            } else {
-              PADDLE_ENFORCE_EQ(
-                  out_array[i].lod(),
-                  in_array[i].lod(),
-                  platform::errors::InvalidArgument(
-                      "The lod message between inputs[%d] and"
-                      " outputs[%d] must be same, but now is not same.",
-                      i,
-                      i));
-              auto stream = ctx.template device_context<
-                                   paddle::platform::NPUDeviceContext>()
-                                .stream();
-              NpuOpRunner runner{
-                  "Add", {out_array[i], in_array[i]}, {out_array[i]}, {}};
-              runner.Run(stream);
-            }
-          }
-        }
-      }
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Expected type of Output(out) must be phi::DenseTensor or "
-          "LoDTensorArray. But got "
-          "unsupport type: %s.",
-          framework::ToTypeName(out_var->Type())));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    sum,
-    ops::SumNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::SumNPUKernel<paddle::platform::NPUDeviceContext,
-                      paddle::platform::float16>);
diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc
deleted file mode 100644
index 1b3ed3ccc7a73..0000000000000
--- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc
+++ /dev/null
@@ -1,1105 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -void training_or_inference(const framework::ExecutionContext &ctx, - const aclrtStream &stream, - const platform::Place &place, - const DataLayout &layout, - const bool &test_mode, - const int &N, - const int &C, - const int &H, - const int &W, - const float epsilon, - const float &momentum, - const phi::DenseTensor *common_mean, - const phi::DenseTensor *common_var, - const phi::DenseTensor *x, - const phi::DenseTensor *scale, - const phi::DenseTensor *bias, - const phi::DenseTensor *mean, - const phi::DenseTensor *variance, - phi::DenseTensor *mean_out, - phi::DenseTensor *variance_out, - phi::DenseTensor *saved_mean, - phi::DenseTensor *saved_variance, - phi::DenseTensor *y) { - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - phi::DenseTensor common_mean_tile_1; - { - common_mean_tile_1.Resize({C}); - common_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_mean, place, &common_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_mean_tile.Resize(x->dims()); - common_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_mean_tile_1}, {common_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_1; - { - common_var_tile_1.Resize({C}); - common_var_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_var, place, &common_var_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_var_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_var_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_var_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_var_tile.Resize(x->dims()); - common_var_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_var_tile_1}, {common_var_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - common_var_tile_add_epsilon.Resize(x->dims()); - common_var_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {common_var_tile}, {common_var_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon_sqrt; - { - common_var_tile_add_epsilon_sqrt.Resize(x->dims()); - common_var_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {common_var_tile_add_epsilon}, - {common_var_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor x_sub_common_mean; - { - x_sub_common_mean.Resize(x->dims()); - x_sub_common_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, common_mean_tile}, {x_sub_common_mean}, {}); - 
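// Note: together with the Div, Mul, and Add blocks that follow, this
// implements the batch norm transform
//   y = scale * (x - mean) / sqrt(var + epsilon) + bias
// one elementwise Ascend op at a time, with the per-channel operands tiled
// to the full input shape via TileD above.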
runner.Run(stream); - } - - phi::DenseTensor normalized; - { - normalized.Resize(x->dims()); - normalized.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", - {x_sub_common_mean, common_var_tile_add_epsilon_sqrt}, - {normalized}, - {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor normalized_mul_scale; - { - normalized_mul_scale.Resize(x->dims()); - normalized_mul_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {normalized, scale_tile}, {normalized_mul_scale}, {}); - runner.Run(stream); - } - - phi::DenseTensor bias_tile_1; - { - bias_tile_1.Resize({C}); - bias_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*bias, place, &bias_tile_1); - if (layout == phi::DataLayout::kNCHW) - bias_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - bias_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor bias_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - bias_tile.Resize(x->dims()); - bias_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {bias_tile_1}, {bias_tile}, attr_input); - runner.Run(stream); - } - - // calculate y - { - y->mutable_data(place); - const auto &runner = - NpuOpRunner("Add", {normalized_mul_scale, bias_tile}, {*y}, {}); - runner.Run(stream); - } - - if (!test_mode) { - phi::DenseTensor ones; - { - ones.Resize({C}); - ones.mutable_data(place); - FillNpuTensorWithConstant(&ones, 1); - } - - // cacl mean_out - { - phi::DenseTensor common_mean_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - common_mean_mul_1_sub_momentum.Resize({C}); - common_mean_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner("Muls", - {*common_mean}, - {common_mean_mul_1_sub_momentum}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_mul_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - mean_mul_momentum.Resize({C}); - mean_mul_momentum.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*mean}, {mean_mul_momentum}, attr_input); - runner.Run(stream); - } - - mean_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {common_mean_mul_1_sub_momentum, mean_mul_momentum}, - {*mean_out}, - {}); - runner.Run(stream); - } - - // cacl variance_out - { - phi::DenseTensor momentum_mul_var; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - momentum_mul_var.Resize({C}); - momentum_mul_var.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*variance}, {momentum_mul_var}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - var_ref_mul_1_sub_momentum.Resize({C}); - var_ref_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {*common_var}, 
{var_ref_mul_1_sub_momentum}, attr_input); - runner.Run(stream); - } - - variance_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {var_ref_mul_1_sub_momentum, momentum_mul_var}, - {*variance_out}, - {}); - runner.Run(stream); - } - - // cacl saved_variance - { - phi::DenseTensor var_ref_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_add_epsilon.Resize({C}); - var_ref_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {*common_var}, {var_ref_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_add_epsilon_sqrt; - { - var_ref_add_epsilon_sqrt.Resize({C}); - var_ref_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner( - "Sqrt", {var_ref_add_epsilon}, {var_ref_add_epsilon_sqrt}, {}); - runner.Run(stream); - } - - saved_variance->mutable_data(place); - - const auto &runner = NpuOpRunner( - "Div", {ones, var_ref_add_epsilon_sqrt}, {*saved_variance}, {}); - runner.Run(stream); - } - } -} - -template -class SyncBatchNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - PADDLE_ENFORCE_EQ(use_global_stats, - false, - platform::errors::InvalidArgument( - "sync_batch_norm doesn't support " - "to set use_global_stats True. Please use batch_norm " - "in this case.")); - - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), - 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, - x_dims.size())); - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - bool test_mode = is_test && (!trainable_stats); - if (test_mode) { // inference - // cacl saved_mean - saved_mean->mutable_data(place); - paddle::framework::TensorCopySync(*mean, place, saved_mean); - - // cacl saved_variance - saved_variance->mutable_data(place); - paddle::framework::TensorCopySync(*variance, place, saved_variance); - - // cacl y - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - mean, - variance, - x, - scale, - bias, - mean, - variance, - NULL, - NULL, - NULL, - NULL, - y); - - } else { // training - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // cacl saved_mean and var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - phi::DenseTensor x_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_sum.Resize({C}); - x_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*x}, {x_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - - float device_counts = 0.0; - if (comm) { - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(mean_out->dtype())); - - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - - // HcclAllReduce x_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_sum.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - // HcclAllReduce x_square_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_square_sum.data())); - void *recvbuff = sendbuff; - 
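// Note: once x_sum and x_square_sum have been all-reduced across ranks, the
// factor 1.0f * C / x_numel / device_counts applied below equals
// 1 / (N * H * W * device_counts), yielding the global per-channel moments
// E[x] and E[x^2]; the variance then follows as E[x^2] - E[x]^2.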
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - } - - // cacl saved_mean - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - saved_mean->mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {x_sum}, {*saved_mean}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - phi::DenseTensor saved_mean_square; - { - saved_mean_square.Resize({C}); - saved_mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {saved_mean_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tmp; - var_ref_tmp.Resize({C}); - var_ref_tmp.mutable_data(place); - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {x_square_sum}, {var_ref_tmp}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {var_ref_tmp, saved_mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - saved_mean, - &var_ref, - x, - scale, - bias, - mean, - variance, - mean_out, - variance_out, - saved_mean, - saved_variance, - y); - } - } -}; - -template -class SyncBatchNormNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto *saved_mean = ctx.Input("SavedMean"); - - const phi::DenseTensor *x; - if (ctx.HasInput("Y")) { - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "sync_batch_norm_grad doesn't support input Y")); - } else { - x = ctx.Input("X"); - } - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(scale->dtype())); - - float device_counts = 0.0; - if (comm) { - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - 
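// Note: every rank contributes 1 to device_count_tensor, so after the
// all-reduce it holds the number of participating devices; the grad kernel
// requires at least two, as enforced right below.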
- std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - PADDLE_ENFORCE_GE( - device_counts, - 2, - platform::errors::PreconditionNotMet("device_counts should >= 2.")); - } - - // cacl var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - // cacl var_ref - { - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum_mean; - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel}}; - x_square_sum_mean.Resize({C}); - x_square_sum_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {x_square_sum}, {x_square_sum_mean}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_square; - { - mean_square.Resize({C}); - mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {mean_square}, {}); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {x_square_sum_mean, mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - phi::DenseTensor saved_mean_tile_1; - { - saved_mean_tile_1.Resize({C}); - saved_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*saved_mean, place, &saved_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - saved_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - saved_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor saved_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - saved_mean_tile.Resize(x->dims()); - saved_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {saved_mean_tile_1}, {saved_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_sub_saved_mean; - { - x_sub_saved_mean.Resize(x->dims()); - x_sub_saved_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, saved_mean_tile}, {x_sub_saved_mean}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_1; - { - var_ref_tile_1.Resize({C}); - var_ref_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(var_ref, place, &var_ref_tile_1); - if (layout == phi::DataLayout::kNCHW) - var_ref_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - var_ref_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor var_ref_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - var_ref_tile.Resize(x->dims()); - var_ref_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {var_ref_tile_1}, {var_ref_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_tile_add_epsilon.Resize(x->dims()); - var_ref_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {var_ref_tile}, {var_ref_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon_sqrt; - { 
- var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); - var_ref_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {var_ref_tile_add_epsilon}, - {var_ref_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_for_scale; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mul_x_sub_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mul_x_sub_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = NpuOpRunner( - "Muls", {dy_mul_x_sub_mean}, {dy_mul_x_sub_mean}, attr_input); - runner.Run(stream); - } - } - - // cacl d_x - if (d_x) { - phi::DenseTensor dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {dy_mean}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mean_tile_1; - { - dy_mean_tile_1.Resize({C}); - dy_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(dy_mean, place, &dy_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mean_tile; - { - framework::NPUAttributeMap 
attr_input = {{"multiples", multiples}}; - dy_mean_tile.Resize(x->dims()); - dy_mean_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {dy_mean_tile_1}, {dy_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_sub_dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } else { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean_mean; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mul_x_sub_mean_mean.Resize({C}); - dy_mul_x_sub_mean_mean.mutable_data(place); - const auto &runner = NpuOpRunner("ReduceMeanD", - {dy_mul_x_sub_mean}, - {dy_mul_x_sub_mean_mean}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile_1; - { - dy_mul_x_sub_mean_mean_tile_1.Resize({C}); - dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync( - dy_mul_x_sub_mean_mean, place, &dy_mul_x_sub_mean_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); - dy_mul_x_sub_mean_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner("TileD", - {dy_mul_x_sub_mean_mean_tile_1}, - {dy_mul_x_sub_mean_mean_tile}, - attr_input); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) - // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile - phi::DenseTensor tmp1; - { - tmp1.Resize(x->dims()); - tmp1.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {x_sub_saved_mean, dy_mul_x_sub_mean_mean_tile}, {tmp1}, {}); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) - // tmp1 / (var + epsilon) - // tmp1 / var_ref_tile_add_epsilon - phi::DenseTensor tmp2; - { - tmp2.Resize(x->dims()); - tmp2.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", {tmp1, var_ref_tile_add_epsilon}, {tmp2}, {}); - runner.Run(stream); - } - - // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / - // (var + epsilon) - // dy_sub_dy_mean - tmp2 - phi::DenseTensor tmp3; - { - tmp3.Resize(x->dims()); - tmp3.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {dy_sub_dy_mean, tmp2}, {tmp3}, {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - // scale 
* (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), - // axis) / (var + epsilon)) - // scale * tmp3 - phi::DenseTensor dx_1; - { - dx_1.Resize(x->dims()); - dx_1.mutable_data(place); - - const auto &runner = NpuOpRunner("Mul", {scale_tile, tmp3}, {dx_1}, {}); - runner.Run(stream); - } - - // dx_1 / var_ref_tile_add_epsilon_sqrt - { - d_x->Resize(x->dims()); - d_x->mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", {dx_1, var_ref_tile_add_epsilon_sqrt}, {*d_x}, {}); - runner.Run(stream); - } - } - - // cacl d_scale - if (d_scale) { - phi::DenseTensor d_scale_2; - { - d_scale_2.Resize(x->dims()); - d_scale_2.mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", - {dy_mul_x_sub_mean_for_scale, var_ref_tile_add_epsilon_sqrt}, - {d_scale_2}, - {}); - runner.Run(stream); - } - - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_scale->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {d_scale_2}, {*d_scale}, attr_input); - runner.Run(stream); - } - } - - // cacl d_bias - if (d_bias) { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sync_batch_norm, - ops::SyncBatchNormNPUKernel); -REGISTER_OP_NPU_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormNPUGradKernel); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc deleted file mode 100644 index ce10caf1b2e19..0000000000000 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
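The kernel below maps take_along_axis to Ascend's GatherElements. For context, a minimal 2-D reference of the dim = 1 case (an illustrative helper, not a Paddle or CANN API):

#include <vector>

// result[i][j] = input[i][index[i][j]] for dim == 1.
std::vector<std::vector<float>> TakeAlongAxisDim1(
    const std::vector<std::vector<float>>& input,
    const std::vector<std::vector<int>>& index) {
  std::vector<std::vector<float>> result(index.size());
  for (size_t i = 0; i < index.size(); ++i)
    for (int j : index[i]) result[i].push_back(input[i][j]);
  return result;
}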
*/

-// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in
-// cmake/operators.cmake when Paddle supports
-#if (CANN_VERSION_CODE >= 504000)
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class NPUTakeAlongAxisKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto input = ctx.Input<phi::DenseTensor>("Input");
-    auto axis = ctx.Attr<int>("Axis");
-    auto index = ctx.Input<phi::DenseTensor>("Index");
-    auto result = ctx.Output<phi::DenseTensor>("Result");
-    result->mutable_data<T>(ctx.GetPlace());
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner(
-        "GatherElements", {*input, *index}, {*result}, {{"dim", axis}});
-    runner.Run(stream);
-  }
-};
-
-template <typename T>
-class NPUTakeAlongAxisGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto axis = ctx.Attr<int>("Axis");
-    auto index = ctx.Input<phi::DenseTensor>("Index");
-    auto result_grad =
-        ctx.Input<phi::DenseTensor>(framework::GradVarName("Result"));
-
-    auto input_grad =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner("ScatterAddWithAxis",
-                                     {*input_grad, *index, *result_grad},
-                                     {*input_grad},
-                                     {{"axis", axis}});
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    take_along_axis,
-    ops::NPUTakeAlongAxisKernel<float>,
-    ops::NPUTakeAlongAxisKernel<plat::float16>,
-    ops::NPUTakeAlongAxisKernel<int>,
-    ops::NPUTakeAlongAxisKernel<int64_t>)
-REGISTER_OP_NPU_KERNEL(
-    take_along_axis_grad,
-    ops::NPUTakeAlongAxisGradKernel<float>,
-    ops::NPUTakeAlongAxisGradKernel<plat::float16>,
-    ops::NPUTakeAlongAxisGradKernel<int>,
-    ops::NPUTakeAlongAxisGradKernel<int64_t>)
-
-#endif
diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc
deleted file mode 100644
index 2e3ab9dac0461..0000000000000
--- a/paddle/fluid/operators/tile_op_npu.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
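In the tile kernel below, repeat_times and the input shape are first promoted to a common rank by left-padding the shorter one with 1s. A standalone sketch of that alignment step (illustrative helper only):

#include <cstdint>
#include <vector>

// Left-pad the shorter of dims/repeats with 1s until the ranks match.
void AlignRanks(std::vector<int64_t>* dims, std::vector<int>* repeats) {
  if (repeats->size() < dims->size())
    repeats->insert(repeats->begin(), dims->size() - repeats->size(), 1);
  else
    dims->insert(dims->begin(), repeats->size() - dims->size(), 1);
}
// e.g. dims = {4, 5}, repeats = {2, 1, 3}  ->  dims = {1, 4, 5}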
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/tile_op_functor.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class TileNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, - 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_size)); - rank = std::max(rank, repeat_times_size); - Tile(context); - } - - protected: - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], - 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), - vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), - repeat_times.size())); - auto* out0 = context.Output("Out"); - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - std::vector temp(repeat_times.size(), 1); - if (repeat_times == temp) { - framework::TensorCopy(*in0, - context.GetPlace(), - context.template device_context(), - out0); - return; - } - - // const auto& runner = - // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = context.template device_context().stream(); - NpuOpRunner runner; - runner.SetType("Tile") - .AddInput(*in0) - .AddInput(std::move(repeat_times)) - .AddOutput(*out0) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(tile, - ops::TileNPUKernel, - ops::TileNPUKernel, -#ifdef 
PADDLE_WITH_ASCEND_INT64
-                       ops::TileNPUKernel<int64_t>,
-#endif
-                       ops::TileNPUKernel<bool>,
-                       ops::TileNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc
deleted file mode 100644
index 478523721458d..0000000000000
--- a/paddle/fluid/operators/top_k_op_npu.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/operators/top_k_op.h"
-
-namespace paddle {
-namespace operators {
-
-void gen_assist_seq(phi::DenseTensor* assit_tensor,
-                    int64_t dim,
-                    const framework::ExecutionContext& ctx) {
-  const int64_t dimx2 = dim;
-  std::vector<paddle::platform::float16> assit;
-  assit.resize(2 * dimx2);
-  for (int64_t i = 0; i < dimx2; i++) {
-    // for i in range [0, dim]
-    assit[i] = static_cast<paddle::platform::float16>(i);
-
-    // for i in range [dim, dimx2]
-    int64_t idx = static_cast<int64_t>(
-        static_cast<paddle::platform::float16>(i));
-    int64_t gap = i - idx;
-    assit[i + dim] = static_cast<paddle::platform::float16>(gap);
-  }
-  framework::TensorFromVector(assit, ctx.device_context(), assit_tensor);
-}
-
-template <typename DeviceContext, typename T>
-class TopkNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // read input
-    auto* input = ctx.Input<phi::DenseTensor>("X");
-    auto* output = ctx.Output<phi::DenseTensor>("Out");
-    auto* indices = ctx.Output<phi::DenseTensor>("Indices");
-
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
-
-    output->mutable_data<T>(ctx.GetPlace());
-    indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    // prepare the assist tensor
-    auto size = input->dims().size();
-    // dim is the last dimension of input
-    auto dim = input->dims()[size - 1];
-    phi::DenseTensor assist_seq_tensor;
-    assist_seq_tensor.Resize({2 * dim});
-    assist_seq_tensor.mutable_data<T>(ctx.GetPlace());
-    gen_assist_seq(&assist_seq_tensor, dim, ctx);
-
-    framework::NPUAttributeMap attr_input = {{"sorted", "true"},
-                                             {"k", static_cast<int>(k)},
-                                             {"dim", -1},
-                                             {"largest", true}};
-
-    phi::DenseTensor tmp_indices(phi::DataType::INT32);
-    tmp_indices.Resize(indices->dims());
-    tmp_indices.mutable_data<int32_t>(ctx.GetPlace());
-
-    // run ascend
-    const auto& runner = NpuOpRunner("TopKD",
-                                     {*input, assist_seq_tensor},
-                                     {*output, tmp_indices},
-                                     attr_input);
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-
-    // cast indices from INT32 to INT64
-    auto dst_dtype =
-        ConvertToNpuDtype(framework::TransToProtoVarType(indices->dtype()));
-    const auto& runner_cast_indices =
-        NpuOpRunner("Cast",
-                    {tmp_indices},
-                    {*indices},
-                    {{"dst_type", static_cast<int>(dst_dtype)}});
-    runner_cast_indices.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-// The Ascend TopKD op only supports float16 input.
-REGISTER_OP_NPU_KERNEL(top_k,
-                       ops::TopkNPUKernel<paddle::platform::NPUDeviceContext,
                                           paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
deleted file mode 100644
index 4e0b0650b9af6..0000000000000
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
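A note on the assist tensor built by gen_assist_seq above: its first half is the index ramp [0, dim) cast to float16, and its second half stores each index's rounding error from that cast, letting TopKD recover exact integer indices. A rough standalone model of the idea, with plain float standing in for Ascend's float16 (where the gap becomes nonzero once i exceeds 2048):

#include <cstdint>
#include <vector>

// First half: indices cast to the narrow type; second half: the rounding
// error of that cast, so index = narrow(i) + narrow(gap) recovers i exactly.
std::vector<float> GenAssistSeqModel(int64_t dim) {
  std::vector<float> assist(2 * dim);
  for (int64_t i = 0; i < dim; ++i) {
    assist[i] = static_cast<float>(i);
    int64_t idx = static_cast<int64_t>(assist[i]);
    assist[i + dim] = static_cast<float>(i - idx);
  }
  return assist;
}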
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { -// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel -// may lead to large accuracy error for float32 data -template -class TopkV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* k_tensor = context.Input("K"); - auto* out = context.Output("Out"); - auto* indices = context.Output("Indices"); // type: INT64 - - int32_t k = static_cast(context.Attr("k")); - int axis = static_cast(context.Attr("axis")); - const bool sorted = static_cast(context.Attr("sorted")); - const bool largest = static_cast(context.Attr("largest")); - - if (axis < 0) { - axis += input->dims().size(); - } - - if (k_tensor != nullptr) { - std::vector v_tmp(1); - paddle::framework::TensorToVector( - *k_tensor, - context.template device_context(), - &v_tmp); - k = static_cast(v_tmp[0]); - } - - framework::DDim output_dims = input->dims(); - output_dims[axis] = k; - - out->Resize(output_dims); - indices->Resize(output_dims); - - out->mutable_data(context.GetPlace()); - indices->mutable_data(context.GetPlace()); - - phi::DenseTensor indices_int32(phi::DataType::INT32); - indices_int32.Resize(output_dims); - indices_int32.mutable_data(context.GetPlace()); - - auto npu_stream = - context.template device_context() - .stream(); - - NpuOpRunner npu_op_runner_topkv2; - npu_op_runner_topkv2.SetType("TopKV2") - .AddInput(*input) - .AddInput(std::vector{k}) - .AddOutput(*out) - .AddOutput(indices_int32) - .AddAttr("sorted", sorted) - .AddAttr("dim", axis) - .AddAttr("largest", largest) - .Run(npu_stream); - - // Cast 'indices_int32' to 'indices', from INT32 to INT64 - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(indices->type())); - const auto& npu_op_runner_cast = - NpuOpRunner("Cast", - {indices_int32}, - {*indices}, - {{"dst_type", static_cast(dst_dtype)}}); - npu_op_runner_cast.Run(npu_stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(top_k_v2, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc deleted file mode 100644 index 5af2edd60ce8f..0000000000000 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class TransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - std::vector axis = ctx.Attr>("axis"); - out->mutable_data(ctx.device_context().GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*x) - .AddInput(std::move(axis)) - .AddOutput(*out); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class TransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - std::vector axis = ctx.Attr>("axis"); - std::vector reversed_axis(axis); - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*out_grad) - .AddInput(std::move(reversed_axis)) - .AddOutput(*x_grad); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - transpose2, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeNPUKernel, -#endif - ops::TransposeNPUKernel, - ops::TransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(transpose2_grad, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeGradNPUKernel, -#endif - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc deleted file mode 100644 index 0ef5af349decf..0000000000000 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
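transpose2_grad in the deleted transpose_op_npu.cc reuses the forward Transpose op, feeding it the inverse of the forward axis permutation; the unit test that follows exercises exactly this round trip on a 2x3 tensor. The inversion is the small loop below, sketched standalone:

    #include <cstddef>
    #include <vector>

    // reversed[axis[i]] = i: dimension j of the gradient output must come
    // from wherever the forward pass sent dimension j.
    std::vector<int> InversePerm(const std::vector<int>& axis) {
      std::vector<int> reversed(axis.size());
      for (size_t i = 0; i < axis.size(); ++i) {
        reversed[axis[i]] = static_cast<int>(i);
      }
      return reversed;
    }
    // InversePerm({1, 2, 0}) == {2, 0, 1}; the 2-D perm {1, 0} is its own
    // inverse, which is why the test's forward and grad expectations coincide.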
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(transpose2); -USE_OP_DEVICE_KERNEL(transpose2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto out = scope->Var("Out"); - auto xshape = scope->Var("XShape"); - auto* x_t = x->GetMutable(); - auto* out_t = out->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto place = ctx.GetPlace(); - - int dim0 = 2; - int dim1 = 3; - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); - ctx.Wait(); - x_t->Resize({dim0, dim1}); - out_t->Resize({dim0, dim1}); - ctx.Wait(); - out_t->mutable_data(place); - ctx.Wait(); - xshape_t->Resize({dim0, dim1}); - xshape_t->mutable_data(place); - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - auto op = f::OpRegistry::CreateOp("transpose2", - {{"X", {"X"}}}, - {{"Out", {"Out"}}, {"XShape", {"XShape"}}}, - attrs); - ctx.Wait(); - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(out_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto xshape = scope->Var("XShape"); - auto x_grad = scope->Var("X@GRAD"); - auto out_grad = scope->Var("Out@GRAD"); - - auto* x_grad_t = x_grad->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto* out_grad_t = out_grad->GetMutable(); - - int dim0 = 2; - int dim1 = 3; - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); - ctx.Wait(); - - x_grad_t->Resize({dim0, dim1}); - xshape_t->Resize( - {0, - dim0, - dim1}); // NOTE(zhiqiu): 0 is needed, see its infershape function - out_grad_t->Resize({dim0, dim1}); - - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - - auto op = f::OpRegistry::CreateOp( - "transpose2_grad", - {{"Out@GRAD", {"Out@GRAD"}}, {"XShape", {"XShape"}}}, - {{"X@GRAD", {"X@GRAD"}}}, - attrs); - - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*x_grad_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -TEST(transpose2, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(transpose2_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc deleted file mode 100644 index b47797a5bb131..0000000000000 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ /dev/null @@ -1,90 
+0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int diagonal = ctx.Attr("diagonal"); - bool lower = ctx.Attr("lower"); - - out->mutable_data(ctx.GetPlace()); - - std::string op_type = lower ? "Tril" : "Triu"; - - framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - - const auto& dev_ctx = - ctx.template device_context(); - - auto op_func_tril = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - auto op_func_triu = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::BOOL) { - if (lower) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_tril, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_triu, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } - } else { - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - tril_triu, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc deleted file mode 100644 index da9fa93130bd1..0000000000000 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
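The deleted tril_triu_op_npu.cc dispatches to Ascend "Tril" or "Triu" and, since those ops lack a bool kernel, routes bool inputs through NpuOpRunner::TypeAdapter as UINT8. Element-wise, both ops reduce to a diagonal predicate; a self-contained sketch of that semantics on a row-major matrix (helper name hypothetical):

    #include <vector>

    // Keep element (i, j) when it lies in the lower (j - i <= diagonal) or
    // upper (j - i >= diagonal) triangle; zero it otherwise.
    std::vector<float> TrilTriu(const std::vector<float>& x, int rows, int cols,
                                int diagonal, bool lower) {
      std::vector<float> out(x.size(), 0.0f);
      for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
          bool keep = lower ? (j - i <= diagonal) : (j - i >= diagonal);
          if (keep) out[i * cols + j] = x[i * cols + j];
        }
      }
      return out;
    }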
*/ - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" - -namespace paddle { -namespace operators { - -template -class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal - std::vector shape = ctx.Attr>("shape"); - phi::DenseTensor shape_tensor(phi::DataType::INT32); - shape_tensor.mutable_data({static_cast(shape.size())}, - ctx.GetPlace()); - paddle::framework::TensorFromVector( - shape, ctx.device_context(), &shape_tensor); - float mean = ctx.Attr("mean"); - phi::DenseTensor mean_tensor(phi::DataType::FLOAT32); - mean_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&mean_tensor, mean); - - float std = ctx.Attr("std"); - phi::DenseTensor std_tensor(phi::DataType::FLOAT32); - std_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&std_tensor, std); - - int32_t seed_var = ctx.Attr("seed"); - - phi::DenseTensor min_tensor(phi::DataType::FLOAT32); - min_tensor.mutable_data({1}, ctx.GetPlace()); - float min_value = mean - std * 2.0; - FillNpuTensorWithConstant(&min_tensor, min_value); - - phi::DenseTensor max_tensor(phi::DataType::FLOAT32); - max_tensor.mutable_data({1}, ctx.GetPlace()); - float max_value = mean + std * 2.0; - FillNpuTensorWithConstant(&max_tensor, max_value); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner( - "ParameterizedTruncatedNormal", - {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, - {*out}, - {{"seed", seed_var}}); - runner.Run(stream); - } -}; - -// NOTE(zhiqiu): actually, this is cpu version kernel, and we need to make the -// above -// npu version work in the future. -template -class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); - TruncatedNormal truncated_normal(mean, std); - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = truncated_normal(dist(*engine)); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(truncated_gaussian_random, - ops::NPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc deleted file mode 100644 index 5958a7751b8be..0000000000000 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
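truncated_gaussian_random_op_npu.cc above carries two kernels: an NPU path that feeds ParameterizedTruncatedNormal host-built scalar tensors for mean, std, and the clip bounds mean - 2*std and mean + 2*std, and the CPU fallback that is actually registered, which maps uniform samples through a TruncatedNormal functor and copies the result to the device. A rough standalone illustration of the two-sigma truncation (plain rejection sampling here, not the inverse-CDF transform the fallback uses):

    #include <random>
    #include <vector>

    // Draw n samples from N(mean, stddev^2) clipped to
    // [mean - 2*stddev, mean + 2*stddev].
    std::vector<float> TruncatedNormal2Sigma(float mean, float stddev, int n,
                                             unsigned int seed) {
      std::mt19937 engine(seed);
      std::normal_distribution<float> dist(mean, stddev);
      std::vector<float> out;
      out.reserve(n);
      while (static_cast<int>(out.size()) < n) {
        float v = dist(engine);
        if (v >= mean - 2.0f * stddev && v <= mean + 2.0f * stddev) {
          out.push_back(v);
        }
      }
      return out;
    }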
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/uniform_random_op.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUUniformRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - phi::DenseTensor *tensor = nullptr; - auto out_var = ctx.OutputVar("Out"); - std::vector new_shape; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { - if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - - if (out_var->IsType()) { - auto *selected_rows = out_var->GetMutable(); - tensor = selected_rows->mutable_value(); - auto shape = ctx.Attr>("shape"); - if (!new_shape.empty()) shape = new_shape; - tensor->Resize(phi::make_ddim(shape)); - selected_rows->mutable_rows()->reserve(shape[0]); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable(); - if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be " - "phi::DenseTensor, " - "SelectedRows. 
But got " - "unsupport type: %s.", - framework::ToTypeName(out_var->Type()))); - } - tensor->mutable_data(ctx.GetPlace()); - int64_t size = tensor->numel(); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); - - std::uniform_real_distribution dist( - static_cast(ctx.Attr("min")), - static_cast(ctx.Attr("max"))); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data_cpu[i] = dist(*engine); - } - - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - auto diag_val = static_cast(ctx.Attr("diag_val")); - if (diag_num > 0) { - PADDLE_ENFORCE_GT( - size, - (diag_num - 1) * (diag_step + 1), - platform::errors::InvalidArgument( - "ShapeInvalid: the diagonal's elements is equal (num-1) " - "* (step-1) with num %d, step %d," - "It should be smaller than %d, but received %d", - diag_num, - diag_step, - (diag_num - 1) * (diag_step + 1), - size)); - for (int64_t i = 0; i < diag_num; ++i) { - int64_t pos = i * diag_step + i; - data_cpu[pos] = diag_val; - } - } - - // copy to NPU - framework::TensorCopy( - cpu_tensor, - ctx.GetPlace(), - ctx.template device_context(), - tensor); - ctx.template device_context().Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(uniform_random, - paddle::operators::NPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unsqueeze_op_npu.cc b/paddle/fluid/operators/unsqueeze_op_npu.cc deleted file mode 100644 index b2b09faaa9d44..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc deleted file mode 100644 index bf66941f90278..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
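uniform_random_op_npu.cc above resolves the output shape with the usual precedence (the ShapeTensor input first, then ShapeTensorList, then the shape attribute), samples uniformly into a CPU tensor, optionally overwrites a strided diagonal, and only then copies to the NPU. The diagonal fill places diag_val at flat positions i * diag_step + i after enforcing that the last such position fits; sketched standalone (helper name hypothetical):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Overwrite diag_num entries at stride diag_step + 1, mirroring the
    // deleted kernel's post-processing of the sampled buffer.
    void FillDiag(std::vector<float>* data, int64_t diag_num, int64_t diag_step,
                  float diag_val) {
      if (diag_num <= 0) return;
      // Matches the kernel's PADDLE_ENFORCE_GT bound on the last position.
      assert(static_cast<int64_t>(data->size()) >
             (diag_num - 1) * (diag_step + 1));
      for (int64_t i = 0; i < diag_num; ++i) {
        (*data)[i * diag_step + i] = diag_val;
      }
    }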
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(unsqueeze); -USE_OP_DEVICE_KERNEL(unsqueeze, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - int dim0 = 5; - int dim1 = 10; - - std::vector init; - for (int64_t i = 0; i < dim0 * dim1; ++i) { - init.push_back(static_cast(0.1)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({dim0, dim1}); - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector axis; - axis.push_back(1); - f::AttributeMap attrs = {{"axes", axis}}; - - auto op = f::OpRegistry::CreateOp( - "unsqueeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(3)); - EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(5)); - EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(1)); - EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10)); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(0.1)); - } - - ctx.Wait(); -} - -TEST(unsqueeze, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc deleted file mode 100644 index 4c1aa39168b69..0000000000000 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
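The unsqueeze NPU test above expects a {5, 10} input with axes = {1} to come out as {5, 1, 10}: each listed axis inserts a size-1 dimension at that position in the output shape. A sketch of the shape computation, assuming the axes are already normalized and ascending (as they are in the test):

    #include <cstdint>
    #include <vector>

    // Insert a size-1 dimension at each output axis, left to right.
    std::vector<int64_t> UnsqueezeDims(std::vector<int64_t> dims,
                                       const std::vector<int>& axes) {
      for (int axis : axes) {
        dims.insert(dims.begin() + axis, 1);
      }
      return dims;
    }
    // UnsqueezeDims({5, 10}, {1}) == {5, 1, 10}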
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class UnStackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto &runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -template -class UnStackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - unstack, - ops::UnStackNPUKernel, - ops::UnStackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - unstack_grad, - ops::UnStackGradNPUKernel, - ops::UnStackGradNPUKernel); diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc deleted file mode 100644 index b5c61e6b988aa..0000000000000 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
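unstack_op_npu.cc above maps unstack to Ascend "Unpack" and its gradient to "Pack"; the only host-side arithmetic is axis normalization. The forward pass adds the input rank to a negative axis, while the gradient adds rank + 1, because re-stacking the slice gradients creates one extra dimension. As a one-liner sketch (name hypothetical):

    // Normalize a possibly negative unstack axis against the rank of the
    // tensors at hand; the gradient path normalizes against rank + 1.
    inline int NormalizeUnstackAxis(int axis, int rank, bool is_grad) {
      if (axis < 0) axis += is_grad ? rank + 1 : rank;
      return axis;
    }
    // NormalizeUnstackAxis(-1, 3, /*is_grad=*/false) == 2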
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class NPUWhereIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); - - auto dims = condition->dims(); - const int rank = dims.size(); - - auto place = context.GetPlace(); - const aclrtStream& stream = dev_ctx.stream(); - - // Run Cast and ReduceSum to get 0 dim of Out - phi::DenseTensor booled_cond; - if (framework::TransToProtoVarType(condition->dtype()) != - framework::proto::VarType::BOOL) { - auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL); - booled_cond.mutable_data(dims, place); - const auto& booled_runner = - NpuOpRunner("Cast", - {*condition}, - {booled_cond}, - {{"dst_type", static_cast(bool_type)}}); - booled_runner.Run(stream); - } else { - booled_cond.ShareDataWith(*condition); - } - phi::DenseTensor casted_cond; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64); - casted_cond.mutable_data(dims, place); - const auto& cast_runner = - NpuOpRunner("Cast", - {booled_cond}, - {casted_cond}, - {{"dst_type", static_cast(dst_dtype)}}); - cast_runner.Run(stream); - - phi::DenseTensor sumed_true_num; - sumed_true_num.mutable_data({1}, place); - phi::DenseTensor cond_axes; - cond_axes.mutable_data({dims.size()}, place); - std::vector axes_vec; - for (int i = 0; i < dims.size(); ++i) { - axes_vec.push_back(i); - } - framework::TensorFromVector(axes_vec, dev_ctx, &cond_axes); - const auto& sum_runner = NpuOpRunner("ReduceSum", - {casted_cond, cond_axes}, - {sumed_true_num}, - {{"keep_dims", false}}); - sum_runner.Run(stream); - - phi::DenseTensor local_true_num; - paddle::framework::TensorCopySync( - sumed_true_num, platform::CPUPlace(), &local_true_num); - auto true_num = *local_true_num.data(); - - out->Resize(phi::make_ddim({true_num, rank})); - out->mutable_data(place); - - if (true_num == 0) { - return; - } - - out->set_layout(DataLayout::kAnyLayout); - NpuOpRunner runner{"Where", {*condition}, {*out}}; - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(where_index, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel); diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc deleted file mode 100644 index e1af771f947bb..0000000000000 --- a/paddle/fluid/operators/where_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
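where_index_op_npu.cc above must size Out before it can launch Where: it casts the condition to bool if needed, casts that to INT64, ReduceSums across every axis to count the true elements, copies the count back to the host, and resizes Out to {true_num, rank}. The counting pass has a direct host analogue:

    #include <cstdint>
    #include <vector>

    // Count nonzero entries and record their flat positions; pos.size()
    // plays the role of true_num in the deleted kernel.
    template <typename T>
    std::vector<int64_t> NonzeroPositions(const std::vector<T>& cond) {
      std::vector<int64_t> pos;
      for (int64_t i = 0; i < static_cast<int64_t>(cond.size()); ++i) {
        if (cond[i] != static_cast<T>(0)) pos.push_back(i);
      }
      return pos;
    }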
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class WhereNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Select", {*condition, *X, *Y}, {*out}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class WhereGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* dout_t = ctx.Input(framework::GradVarName("Out")); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - auto* dy_t = ctx.Output(framework::GradVarName("Y")); - - if (dx_t != nullptr) { - dx_t->mutable_data(ctx.GetPlace()); - } - if (dy_t != nullptr) { - dy_t->mutable_data(ctx.GetPlace()); - } - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_zeros(dout_t->dtype()); - tensor_zeros.mutable_data(dout_t->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ZerosLike", {*dout_t}, {tensor_zeros}, {}); - runner.Run(stream); - - if (dx_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, *dout_t, tensor_zeros}, {*dx_t}, {}); - runner.Run(stream); - } - if (dy_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, tensor_zeros, *dout_t}, {*dy_t}, {}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - where, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel); - -REGISTER_OP_NPU_KERNEL( - where_grad, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel);
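Finally, where_grad in the deleted where_op_npu.cc builds one ZerosLike tensor and runs Select twice, dX = Select(cond, dOut, zeros) and dY = Select(cond, zeros, dOut), skipping either output when its gradient is not requested. Element-wise that amounts to:

    #include <vector>

    // dX keeps dOut where cond holds; dY keeps it where cond does not.
    void WhereGrad(const std::vector<bool>& cond, const std::vector<float>& dout,
                   std::vector<float>* dx, std::vector<float>* dy) {
      dx->assign(dout.size(), 0.0f);
      dy->assign(dout.size(), 0.0f);
      for (size_t i = 0; i < dout.size(); ++i) {
        if (cond[i]) (*dx)[i] = dout[i];
        else         (*dy)[i] = dout[i];
      }
    }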