diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc deleted file mode 100644 index 937cd46d52888..0000000000000 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ /dev/null @@ -1,588 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/phi/common/data_type.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class NPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class BeamSearchFunctor { - public: - void operator()(const platform::NPUDeviceContext& ctx, - const phi::DenseTensor* pre_ids, - const phi::DenseTensor* pre_scores, - const phi::DenseTensor* ids, - const phi::DenseTensor* scores, - phi::DenseTensor* selected_ids, - phi::DenseTensor* selected_scores, - phi::DenseTensor* parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated) { - auto abs_lod = framework::ToAbsOffset(scores->lod()); - auto& high_level = abs_lod[level]; - - int64_t num_seqs = scores->NumElements(level); - // size of the first beam is 1, others are equal to beam_size - int64_t real_beam_size = static_cast(scores->dims()[0] / num_seqs); - // K - int64_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { - seq_width *= scores->dims()[i]; - } - - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - - int64_t total_length = num_seqs * beam_size; - int64_t batch_size = static_cast(scores->dims()[0]); - selected_ids->mutable_data(phi::make_ddim({total_length, 1}), - place); - selected_scores->mutable_data(phi::make_ddim({total_length, 1}), - place); - parent_idx->mutable_data(phi::make_ddim({total_length}), place); - - // Step1: Define Tensors and Preprocess the situation that pre_id == end_id - - // cast ids and pre_ids from int to float32 - Tensor ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(ids->dtype()) != - framework::proto::VarType::INT32) { - ids_int32.Resize(ids->dims()); - ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_ids_int32 = - ConvertToNpuDtype(framework::TransToProtoVarType(ids_int32.dtype())); - const auto& runner_ids_int32 = - NpuOpRunner("Cast", - {*ids}, - {ids_int32}, - {{"dst_type", static_cast(dst_dtype_ids_int32)}}); - runner_ids_int32.Run(stream); - } else { - ids_int32.ShareDataWith(*ids); - } - - Tensor pre_ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(pre_ids->dtype()) != - framework::proto::VarType::INT32) { - pre_ids_int32.Resize(pre_ids->dims()); - pre_ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_pre_ids_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(pre_ids_int32.dtype())); - const auto& runner_pre_ids_int32 = NpuOpRunner( - "Cast", - {*pre_ids}, - {pre_ids_int32}, - {{"dst_type", 
static_cast(dst_dtype_pre_ids_int32)}}); - runner_pre_ids_int32.Run(stream); - } else { - pre_ids_int32.ShareDataWith(*pre_ids); - } - - Tensor expand_pre_ids(pre_ids_int32.dtype()); - expand_pre_ids.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_ids.mutable_data(place); - const auto& runner_tile_pre_ids = - NpuOpRunner("TileWithAxis", - {pre_ids_int32}, - {expand_pre_ids}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_ids.Run(stream); - expand_pre_ids.Resize(ids_int32.dims()); - - Tensor expand_pre_scores(pre_scores->dtype()); - expand_pre_scores.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_scores.mutable_data(place); - const auto& runner_tile_pre_scores = - NpuOpRunner("TileWithAxis", - {*pre_scores}, - {expand_pre_scores}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_scores.Run(stream); - expand_pre_scores.Resize(scores->dims()); - - // End_id Tensors - Tensor end_id_tmp_tensor(phi::DataType::INT32); - end_id_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&end_id_tmp_tensor, end_id); - - Tensor end_id_tensors(ids_int32.dtype()); - end_id_tensors.mutable_data(ids_int32.dims(), place); - const auto& runner_fill_end_id = - NpuOpRunner("FillD", - {end_id_tmp_tensor}, - {end_id_tensors}, - {{"dims", phi::vectorize(ids_int32.dims())}}); - runner_fill_end_id.Run(stream); - - // whether expand_pre_ids == end_ids? - Tensor equal_end_ids(phi::DataType::BOOL); - equal_end_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_equal_end_ids = NpuOpRunner( - "Equal", {expand_pre_ids, end_id_tensors}, {equal_end_ids}, {}); - runner_equal_end_ids.Run(stream); - - // construct a Tensor with dimension ids->dims(): - // [[False, True, True, True, ...], - // [False, True, True, True, ...], - // ...] 
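// What follows builds, as far as can be inferred from the surrounding
// code, a per-row mask whose first column is false and whose remaining
// seq_width - 1 columns are true. ANDed with equal_end_ids, the mask
// marks, for every beam that has already emitted end_id, all candidate
// slots except the first; those slots are later overwritten with -inf,
// so a finished beam contributes exactly one (its accumulated) score to
// the top-k selection.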
- Tensor false_tmp_tensor(phi::DataType::INT32); - false_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&false_tmp_tensor, static_cast(false)); - - Tensor first_pos_false_tensors(phi::DataType::INT32); - first_pos_false_tensors.Resize(phi::make_ddim({batch_size, 1})); - first_pos_false_tensors.mutable_data(place); - std::vector fill_dims = {batch_size, 1}; - framework::NPUAttributeMap fill_attr = {{"dims", fill_dims}}; - const auto& runner_fill_false_tensors = NpuOpRunner( - "FillD", {false_tmp_tensor}, {first_pos_false_tensors}, fill_attr); - runner_fill_false_tensors.Run(stream); - - Tensor pos_tensors(phi::DataType::INT32); - if (seq_width > 1) { - pos_tensors.Resize(phi::make_ddim({batch_size, seq_width})); - pos_tensors.mutable_data(place); - - Tensor true_tmp_tensor(phi::DataType::INT32); - true_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&true_tmp_tensor, static_cast(true)); - - Tensor second_pos_true_tensors(phi::DataType::INT32); - second_pos_true_tensors.Resize( - phi::make_ddim({batch_size, seq_width - 1})); - second_pos_true_tensors.mutable_data(place); - std::vector fill_dims2 = {batch_size, seq_width - 1}; - framework::NPUAttributeMap fill_attr2 = {{"dims", fill_dims2}}; - const auto& runner_fill_true_tensors = NpuOpRunner( - "FillD", {true_tmp_tensor}, {second_pos_true_tensors}, fill_attr2); - runner_fill_true_tensors.Run(stream); - - std::vector concat_inputs = {first_pos_false_tensors, - second_pos_true_tensors}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_false_true{"ConcatD", - {concat_inputs}, - {pos_tensors}, - {{"concat_dim", 1}, {"N", 2}}}; - runner_concat_false_true.AddInputNames(concat_names); - runner_concat_false_true.Run(stream); - pos_tensors.Resize(ids_int32.dims()); - } else { - pos_tensors.ShareDataWith(first_pos_false_tensors); - } - - Tensor cast_pos_tensors_bool(phi::DataType::BOOL); - cast_pos_tensors_bool.Resize(pos_tensors.dims()); - cast_pos_tensors_bool.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_pos_tensors_bool.type())); - const auto& runner_cast_pos_tensors = - NpuOpRunner("Cast", - {pos_tensors}, - {cast_pos_tensors_bool}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_pos_tensors.Run(stream); - - // if pre_ids == end_ids, save only one score, and others become -inf - // construct pre_ids == end_ids and save only one score - Tensor save_one_end_score(phi::DataType::BOOL); - save_one_end_score.mutable_data(ids_int32.dims(), place); - const auto& runner_logical_and = - NpuOpRunner("LogicalAnd", - {equal_end_ids, cast_pos_tensors_bool}, - {save_one_end_score}, - {}); - runner_logical_and.Run(stream); - - // if save_one_end_score is True, set score to -inf - // define -Inf Tensors - Tensor ninf_tmp_tensor(scores->dtype()); - ninf_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - float ninf_value = - static_cast(-std::numeric_limits::infinity()); - FillNpuTensorWithConstant(&ninf_tmp_tensor, ninf_value); - - Tensor ninf_tensors(scores->dtype()); - ninf_tensors.mutable_data(scores->dims(), place); - const auto& runner_fill_ninf = - NpuOpRunner("FillD", - {ninf_tmp_tensor}, - {ninf_tensors}, - {{"dims", phi::vectorize(scores->dims())}}); - runner_fill_ninf.Run(stream); - - // Step2: calculate topk scores - - // get scores used in topk op - Tensor tmp_scores(scores->dtype()); - tmp_scores.mutable_data(scores->dims(), place); - if (!is_accumulated) { - // if pre_id == end_id, cal_scores = pre_score, and 
id = end_id - // else, cal_score = pre_score + log(score) - - // calculate log(scores) - Tensor log_scores(scores->dtype()); - log_scores.mutable_data(scores->dims(), place); - - Tensor one(scores->dtype()); - one.mutable_data(scores->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*scores}, {one}, {}); - runner_one.Run(stream); - - Tensor sub(scores->dtype()); - sub.mutable_data(scores->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*scores, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_log_scores = - NpuOpRunner("Log1p", {sub}, {log_scores}, {}); - runner_log_scores.Run(stream); - - // tmp_scores = pre_score + log(scores) - const auto& runner_add_scores = - NpuOpRunner("Add", {log_scores, *pre_scores}, {tmp_scores}, {}); - runner_add_scores.Run(stream); - - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, tmp_scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score.Run(stream); - } else { - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score2 = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, *scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score2.Run(stream); - } - - // if pre_ids == end_ids, save only one score, and others become -inf - Tensor cal_scores(scores->dtype()); - cal_scores.mutable_data(scores->dims(), place); - const auto& runner_select_inf_score = - NpuOpRunner("Select", - {save_one_end_score, ninf_tensors, tmp_scores}, - {cal_scores}, - {}); - runner_select_inf_score.Run(stream); - - // resize scores from [num_seqs * beam_size, K] to [num_seqs, beam_size * K] - // real_beam_size = 1 or beam_size - cal_scores.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - Tensor topk_scores(scores->dtype()); - topk_scores.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - topk_scores.mutable_data(ctx.GetPlace()); - - Tensor tmp_indices(phi::DataType::INT32); - tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - tmp_indices.mutable_data(ctx.GetPlace()); - - // run topk op - NpuOpRunner runner_topk; - runner_topk.SetType("TopKV2") - .AddInput(cal_scores) - .AddInput(std::vector{static_cast(beam_size)}) - .AddOutput(topk_scores) - .AddOutput(tmp_indices) - .AddAttr("sorted", true) - .AddAttr("dim", -1) - .AddAttr("largest", true); - runner_topk.Run(stream); - - // cast tmp_indices from int to float32 for Sort op - Tensor cast_tmp_indices(phi::DataType::FLOAT32); - cast_tmp_indices.Resize(tmp_indices.dims()); - cast_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_fp32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_tmp_indices.type())); - const auto& runner_cast_tmp_indices = NpuOpRunner( - "Cast", - {tmp_indices}, - {cast_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_fp32)}}); - runner_cast_tmp_indices.Run(stream); - - // sort tmp_indices - Tensor sorted_tmp_indices(phi::DataType::FLOAT32); - sorted_tmp_indices.Resize(tmp_indices.dims()); - sorted_tmp_indices.mutable_data(ctx.GetPlace()); - Tensor sorted_score_indices(phi::DataType::INT32); - sorted_score_indices.Resize(tmp_indices.dims()); - sorted_score_indices.mutable_data(ctx.GetPlace()); - const auto& runner_sort_tmp_indices = - NpuOpRunner("Sort", - {cast_tmp_indices}, - {sorted_tmp_indices, sorted_score_indices}, - {{"axis", 1}, {"descending", false}}); - 
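// Rationale (inferred): TopKV2 returns indices ordered by score, but the
// beam-search outputs must stay grouped by the original candidate order
// so that parent_idx and the output LoD remain monotone. Sorting the
// top-k indices ascending restores that order, and the second Sort
// output (the argsort permutation) is reused below by GatherNd to
// reorder the top-k scores consistently.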
runner_sort_tmp_indices.Run(stream); - - // cast sorted_tmp_indices from float32 to int - Tensor cast_sort_tmp_indices(phi::DataType::INT32); - cast_sort_tmp_indices.Resize(sorted_tmp_indices.dims()); - cast_sort_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_sort_tmp_indices.type())); - const auto& runner_cast_sort_tmp_indices = NpuOpRunner( - "Cast", - {sorted_tmp_indices}, - {cast_sort_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_int32)}}); - runner_cast_sort_tmp_indices.Run(stream); - - // Step 3: infer selected ids from tmp_indices and ids - - // if pre_ids == end_ids, use pre_ids rather than ids - Tensor cal_ids(ids_int32.dtype()); - cal_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_select_equal_end_id = NpuOpRunner( - "Select", {equal_end_ids, expand_pre_ids, ids_int32}, {cal_ids}, {}); - runner_select_equal_end_id.Run(stream); - - // resize ids from [num_seqs * real_beam_size, K] to [num_seqs, - // real_beam_size * K] - // real_beam_size = 1 or beam_size - cal_ids.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - // construct batch_ids like [[0, 0, 0], [1, 1, 1], ..., [bs-1, bs-1, bs-1]] - // construct arange(num_seqs*beam_size).reshape((num_seqs, beam_size)) // - // beam_size - Tensor batch_ids(phi::DataType::INT32); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - batch_ids.mutable_data(place); - - std::vector vector_batch_ids; - for (int i = 0; i < num_seqs * static_cast(beam_size); ++i) { - vector_batch_ids.push_back(static_cast(i / beam_size)); - } - framework::TensorFromVector(vector_batch_ids, ctx, &batch_ids); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - - // sort topk_scores to get selected_scores - // get indices of gather_nd op for calculating selected_scores - Tensor gather_nd_score_indices(phi::DataType::INT32); - gather_nd_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_score_indices.mutable_data(place); - - sorted_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - std::vector concat_inputs2 = {batch_ids, - sorted_score_indices}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_score_indices{"ConcatD", - {concat_inputs2}, - {gather_nd_score_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_score_indices.AddInputNames(concat_names); - runner_concat_score_indices.Run(stream); - - // use gather_nd to get selected_scores - const auto& runner_gather_nd_scores = - NpuOpRunner("GatherNd", - {topk_scores, gather_nd_score_indices}, - {*selected_scores}, - {}); - runner_gather_nd_scores.Run(stream); - - // get indices of gather_nd op - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - Tensor gather_nd_id_indices(phi::DataType::INT32); - gather_nd_id_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_id_indices.mutable_data(place); - - std::vector concat_inputs3 = {batch_ids, - cast_sort_tmp_indices}; - NpuOpRunner runner_concat_id_indices{"ConcatD", - {concat_inputs3}, - {gather_nd_id_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_id_indices.AddInputNames(concat_names); - runner_concat_id_indices.Run(stream); - - // use gather_nd to get selected_ids - Tensor topk_ids(phi::DataType::INT32); - topk_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - 
topk_ids.mutable_data(ctx.GetPlace()); - - const auto& runner_gather_nd_ids = NpuOpRunner( - "GatherNd", {cal_ids, gather_nd_id_indices}, {topk_ids}, {}); - runner_gather_nd_ids.Run(stream); - - // cast topk_ids from int to int64 to get selected_ids - auto dst_dtype_selected_ids = - ConvertToNpuDtype(framework::TransToProtoVarType(selected_ids->type())); - const auto& runner_cast_selected_ids = - NpuOpRunner("Cast", - {topk_ids}, - {*selected_ids}, - {{"dst_type", static_cast(dst_dtype_selected_ids)}}); - runner_cast_selected_ids.Run(stream); - - // TODO(pangyoki): PruneEndBeams - - // Step 4: set lod of output Tensor - // define Tensor with value `seq_width` - Tensor seq_width_tensor(phi::DataType::INT32); - seq_width_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&seq_width_tensor, - static_cast(seq_width)); - - // beam_ids = tmp_indices // seq_width - Tensor beam_ids(phi::DataType::INT32); - beam_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - beam_ids.mutable_data(ctx.GetPlace()); - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - const auto& runner_div = NpuOpRunner( - "Div", {cast_sort_tmp_indices, seq_width_tensor}, {beam_ids}, {}); - runner_div.Run(stream); - - // get parent_idx by adding batch_ids to beam_ids - // construct scale_batch_ids like [[0, 0, 0], [bw, bw, bw], ..., [bs-1*bw, - // bs-1*bw, bs-1*bw]] - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - // cast batch_ids from int to float32 - Tensor cast_batch_ids(phi::DataType::FLOAT32); - cast_batch_ids.Resize(batch_ids.dims()); - cast_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype1 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_batch_ids.type())); - const auto& runner_cast_batch_ids = - NpuOpRunner("Cast", - {batch_ids}, - {cast_batch_ids}, - {{"dst_type", static_cast(dst_dtype1)}}); - runner_cast_batch_ids.Run(stream); - - // scale batch_ids with beam_size - Tensor scale_batch_ids(phi::DataType::FLOAT32); - scale_batch_ids.Resize(batch_ids.dims()); - scale_batch_ids.mutable_data(place); - const auto& runner_power = - NpuOpRunner("Power", - {cast_batch_ids}, - {scale_batch_ids}, - {{"power", static_cast(1.0)}, - {"scale", static_cast(beam_size)}, - {"shift", static_cast(0.0)}}); - runner_power.Run(stream); - - // cast cast_scale_batch_ids from float32 to int - Tensor cast_scale_batch_ids(phi::DataType::INT32); - cast_scale_batch_ids.Resize(scale_batch_ids.dims()); - cast_scale_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype2 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_scale_batch_ids.type())); - const auto& runner_cast_scale_batch_ids = - NpuOpRunner("Cast", - {scale_batch_ids}, - {cast_scale_batch_ids}, - {{"dst_type", static_cast(dst_dtype2)}}); - runner_cast_scale_batch_ids.Run(stream); - - // calculate parent_idx - Tensor tmp_parent_idx(phi::DataType::INT32); - tmp_parent_idx.Resize(parent_idx->dims()); - tmp_parent_idx.mutable_data(place); - const auto& runner_add_beam_id = NpuOpRunner( - "Add", {beam_ids, cast_scale_batch_ids}, {tmp_parent_idx}, {}); - runner_add_beam_id.Run(stream); - - // cast tmp_parent_idx from int to int64 to get parent_idx - auto dst_dtype_parent_idx = - ConvertToNpuDtype(framework::TransToProtoVarType(parent_idx->type())); - const auto& runner_cast_parent_idx = - NpuOpRunner("Cast", - {tmp_parent_idx}, - {*parent_idx}, - {{"dst_type", static_cast(dst_dtype_parent_idx)}}); - runner_cast_parent_idx.Run(stream); - - std::vector 
vector_parent_idx; - framework::TensorToVector(tmp_parent_idx, ctx, &vector_parent_idx); - - // set low level, len(low_level) = high_level[-1] - std::vector low_level; - std::vector num_parent_ids(num_seqs * beam_size, - static_cast(0)); - size_t low_level_size = high_level[num_seqs]; - size_t sum_parent_id = 0; - - // calculate number of every parent_id - for (size_t i = 0; i < num_seqs * beam_size; ++i) { - num_parent_ids[vector_parent_idx[i]]++; - } - - // update low_level - low_level.push_back(0); - for (size_t i = 0; i < low_level_size; ++i) { - sum_parent_id += num_parent_ids[i]; - low_level.push_back(sum_parent_id); - } - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(lod))); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); - } -}; - -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc deleted file mode 100644 index 094f39366ab35..0000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class AccuracyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* label = ctx.Input("Label"); - auto* indices = ctx.Input("Indices"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - auto stream = - ctx.template device_context() - .stream(); - - int num_samples = inference->dims()[0]; - if (num_samples == 0) { - return; - } - - // cast `indices` or `label` if their type is not consistent - Tensor cast_indices(phi::DataType::INT32); - Tensor cast_label(phi::DataType::INT32); - if (indices->dtype() != label->dtype()) { - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - if (framework::TransToProtoVarType(indices->dtype()) != - framework::proto::VarType::INT32) { - cast_indices.Resize(indices->dims()); - cast_indices.mutable_data(ctx.GetPlace()); - const auto& runner_cast_indices = - NpuOpRunner("Cast", - {*indices}, - {cast_indices}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_indices.Run(stream); - } else { - cast_indices.ShareDataWith(*indices); - } - if (framework::TransToProtoVarType(label->dtype()) != - framework::proto::VarType::INT32) { - cast_label.Resize(label->dims()); - cast_label.mutable_data(ctx.GetPlace()); - const auto& runner_cast_label = - NpuOpRunner("Cast", - {*label}, - {cast_label}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_label.Run(stream); - } else { - cast_label.ShareDataWith(*label); - } - } else { - cast_indices.ShareDataWith(*indices); - cast_label.ShareDataWith(*label); - } - - // equal - Tensor tmp_equal(phi::DataType::BOOL); - tmp_equal.Resize(inference->dims()); - tmp_equal.mutable_data(ctx.GetPlace()); - const auto& runner_equal = - NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); - runner_equal.Run(stream); - - // cast equal - Tensor tmp_equal_cast(phi::DataType::FLOAT32); - tmp_equal_cast.Resize(inference->dims()); - tmp_equal_cast.mutable_data(ctx.GetPlace()); - const auto& runner_cast_equal = NpuOpRunner( - "Cast", - {tmp_equal}, - {tmp_equal_cast}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(tmp_equal_cast.dtype())))}}); - runner_cast_equal.Run(stream); - - // [correct] - // reduce_max - Tensor tmp_correct_max(phi::DataType::FLOAT32); - tmp_correct_max.Resize(phi::make_ddim({num_samples})); - tmp_correct_max.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_max = - NpuOpRunner("ReduceMaxD", - {tmp_equal_cast}, - {tmp_correct_max}, - {{"axes", std::vector{1}}, {"keep_dims", false}}); - runner_reduce_max.Run(stream); - - // reduce_sum - Tensor tmp_correct(phi::DataType::FLOAT32); - tmp_correct.Resize(correct->dims()); - tmp_correct.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {tmp_correct_max}, - {tmp_correct}, - {{"axes", std::vector{0}}, {"keep_dims", false}}); - runner_reduce_sum.Run(stream); - - // cast to int - correct->mutable_data(ctx.GetPlace()); - const auto& runner_cast_correct = - NpuOpRunner("Cast", - {tmp_correct}, - {*correct}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(correct->dtype())))}}); - runner_cast_correct.Run(stream); - - // [total] - total->mutable_data(ctx.GetPlace()); 
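// Recap (inferred from the ops above): `correct` is ReduceMax over the
// per-sample equality mask (1.0 if any top-k index hits the label) then
// ReduceSum over the batch. `total` is simply the sample count; a
// float32 copy of it is filled below so that accuracy = correct / total
// can be computed with the floating-point Div runner.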
- FillNpuTensorWithConstant(total, static_cast(num_samples)); - - // use `total` of type `float32` for calculating accuracy - Tensor tmp_total(phi::DataType::FLOAT32); - tmp_total.Resize(total->dims()); - tmp_total.mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(&tmp_total, - static_cast(num_samples)); - - // [accuracy] - accuracy->mutable_data(ctx.GetPlace()); - const auto& runner_accuracy = - NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); - runner_accuracy.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - accuracy, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc deleted file mode 100644 index 3324e56b3b95f..0000000000000 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ /dev/null @@ -1,345 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class AdamNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Grad(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(param_var->Type()))); - auto* grad = ctx.Input("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector( - *skip_update_tensor, ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - 
// mutable_data - if (skip_update) { - VLOG(4) << "Adam skip update"; - framework::TensorCopy( - *param, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - framework::TensorCopy( - *mom1, - ctx.GetPlace(), - ctx.template device_context(), - mom1_out); - framework::TensorCopy( - *mom2, - ctx.GetPlace(), - ctx.template device_context(), - mom2_out); - framework::TensorCopy( - *beta1_pow, - beta1_pow->place(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, - beta2_pow->place(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - param_out->mutable_data(ctx.GetPlace()); - mom1_out->mutable_data(ctx.GetPlace()); - mom2_out->mutable_data(ctx.GetPlace()); - - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform - // place. - phi::DenseTensor beta1_pow_tmp; - phi::DenseTensor beta2_pow_tmp; - if (beta1_pow->place() == platform::CPUPlace()) { - T beta1 = *beta1_pow->data(); - beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); - beta1_pow = &beta1_pow_tmp; - } - if (beta2_pow->place() == platform::CPUPlace()) { - T beta2 = *beta2_pow->data(); - beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); - beta2_pow = &beta2_pow_tmp; - } - - const phi::DenseTensor* beta1_tensor = nullptr; - const phi::DenseTensor* beta2_tensor = nullptr; - const phi::DenseTensor* epsilon_tensor = nullptr; - - phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32); - phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32); - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - - if (ctx.HasInput("Beta1Tensor")) { - beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - } else { - T beta1 = static_cast(ctx.Attr("beta1")); - beta1_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_tmp, beta1); - beta1_tensor = &beta1_tmp; - } - - if (ctx.HasInput("Beta2Tensor")) { - beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - } else { - T beta2 = static_cast(ctx.Attr("beta2")); - beta2_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_tmp, beta2); - beta2_tensor = &beta2_tmp; - } - - if (ctx.HasInput("EpsilonTensor")) { - epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - } else { - T epsilon = static_cast(ctx.Attr("epsilon")); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - } - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 
1, but received "
-                          "value is:%d.",
-                          beta2_pow_out->numel()));
-
-    auto stream =
-        ctx.template device_context<platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner("ApplyAdamD",
-                                     {
-                                         *param,
-                                         *mom1,
-                                         *mom2,
-                                         *beta1_pow,
-                                         *beta2_pow,
-                                         *lr,
-                                         *beta1_tensor,
-                                         *beta2_tensor,
-                                         *epsilon_tensor,
-                                         *grad,
-                                     },
-                                     {
-                                         *param_out,
-                                         *mom1_out,
-                                         *mom2_out,
-                                     },
-                                     {});
-    runner.Run(stream);
-
-    // NOTE(zhiqiu): ApplyAdamD updates params in place, so if param and
-    // param_out are not the same, we need to copy.
-    if (param_out->data<T>() != param->data<T>()) {
-      framework::TensorCopy(
-          *param,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          param_out);
-    }
-    if (mom1_out->data<T>() != mom1->data<T>()) {
-      framework::TensorCopy(
-          *mom1,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          mom1_out);
-    }
-    if (mom2_out->data<T>() != mom2->data<T>()) {
-      framework::TensorCopy(
-          *mom2,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          mom2_out);
-    }
-    if (!use_global_beta_pow) {
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
-      const auto& runner_m1 =
-          NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
-      runner_m1.Run(stream);
-      const auto& runner_m2 =
-          NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
-      runner_m2.Run(stream);
-    }
-  }
-};
-
-template <typename T>
-class AdamWNPUKernel : public AdamNPUKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    VLOG(3) << "NPU AdamW Kernel";
-    bool skip_update = false;
-    if (ctx.HasInput("SkipUpdate")) {
-      VLOG(3) << "Has SkipUpdate";
-      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
-      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
-                        1,
-                        platform::errors::InvalidArgument(
-                            "Input(SkipUpdate) size must be 1, but got %d",
-                            skip_update_tensor->numel()));
-      std::vector<bool> skip_update_vec;
-      paddle::framework::TensorToVector(
-          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
-      skip_update = skip_update_vec[0];
-    }
-    VLOG(3) << "Skip update: " << skip_update;
-    bool with_decay = ctx.Attr<bool>("with_decay");
-    if (!skip_update && with_decay) {
-      float coeff = ctx.Attr<float>("coeff");
-      auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
-
-      auto place = ctx.GetPlace();
-
-      auto stream =
-          ctx.template device_context<platform::NPUDeviceContext>()
-              .stream();
-
-      phi::DenseTensor one(phi::DataType::FLOAT32);
-      phi::DenseTensor decay(phi::DataType::FLOAT32);
-      phi::DenseTensor tmp(phi::DataType::FLOAT32);
-
-      tmp.mutable_data<float>({1}, place);
-      one.mutable_data<float>({1}, place);
-      decay.mutable_data<float>({1}, place);
-
-      FillNpuTensorWithConstant<float>(&one, 1.0f);
-      framework::NPUAttributeMap attr_input = {{"value", coeff}};
-
-      const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input);
-      runner1.Run(stream);
-
-      const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {});
-      runner2.Run(stream);
-
-      if (ctx.HasInput("MasterParam")) {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Master Param is not supported on NPU"));
-      } else {
-        auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
-        param_out->mutable_data<T>(ctx.GetPlace());
-
-        const auto* param_var = ctx.InputVar("Param");
-        PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
-                          true,
-                          platform::errors::InvalidArgument(
-                              "The Var(%s)'s type should be phi::DenseTensor, "
-                              "but the received is %s",
-                              ctx.InputNames("Param").front(),
-                              framework::ToTypeName(param_var->Type())));
-        auto* param = ctx.Input<phi::DenseTensor>("Param");
-
-        const auto& runner =
-            NpuOpRunner("Mul",
-                        {*param, decay},
-                        {*const_cast<phi::DenseTensor*>(param)},
-                        {});
-        runner.Run(stream);
-      }
-    }
-    AdamNPUKernel<T>::Compute(ctx);
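// Note (inferred): the with_decay branch above implements decoupled
// weight decay, scaling the parameter by decay = 1 - lr * coeff before
// delegating to the plain Adam update; this is the defining difference
// between AdamW and Adam with L2 regularization.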
} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - adam, - ops::AdamNPUKernel, - ops::AdamNPUKernel); - -REGISTER_OP_NPU_KERNEL(adamw, - ops::AdamWNPUKernel, - ops::AdamWNPUKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc deleted file mode 100644 index 83c805a1f642a..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMergedMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - size_t n = params.size(); - PADDLE_ENFORCE_EQ(n, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(params[i], - params_out[i], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - } - - auto grads = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ( - n, - grads.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grads.size(), - n)); - - auto velocitys = ctx.MultiInput("Velocity"); - PADDLE_ENFORCE_EQ(n, - velocitys.size(), - platform::errors::InvalidArgument( - "The size of Input(Velocity) must be equal to " - "Input(Param), but got the size of Input(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocitys.size(), - n)); - - auto velocitys_out = ctx.MultiOutput("VelocityOut"); - PADDLE_ENFORCE_EQ( - n, - velocitys_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be " - "equal to Input(Param), but got the size of Output(VelocityOut) is " - "%d, the size of Input(Param) is %d.", - velocitys_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(velocitys[i], - velocitys_out[i], - platform::errors::InvalidArgument( - "Input(Velocity) and Output(VelocityOut) must be " - "the same Tensors.")); - } - - T mu = static_cast(ctx.Attr("mu")); - auto lrs = ctx.MultiInput("LearningRate"); - if (lrs.size() != 1) { - 
PADDLE_ENFORCE_EQ( - n, - lrs.size(), - platform::errors::InvalidArgument( - "If the size of Input(LearningRate) is not 1, the size of " - "Input(LearningRate) must be " - "equal to Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lrs.size(), - n)); - } - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_methods = - ctx.Attr>("regularization_method"); - auto regularization_coeffs = - ctx.Attr>("regularization_coeff"); - if (regularization_methods.size() != 0) { - PADDLE_ENFORCE_EQ( - n, - regularization_methods.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_method) must be equal " - "to Input(Param), but got the size of " - "Attr(regularization_method) is %d, the size of Input(Param) is " - "%d.", - regularization_methods.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - regularization_coeffs.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_coeff) must be equal " - "to Input(Param), but got the size of Attr(regularization_coeff) " - "is %d, the size of Input(Param) is %d.", - regularization_coeffs.size(), - n)); - } - - VLOG(5) << "use_nesterov: " << use_nesterov - << ", regularization_methods.size(): " - << regularization_methods.size() - << ", regularization_coeffs.size(): " - << regularization_coeffs.size(); - - auto& dev_ctx = ctx.template device_context(); - - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - for (size_t idx = 0; idx < n; ++idx) { - phi::RegularizationType regularization_flag = - regularization_methods.size() > 0 && - regularization_methods[idx] == "l2_decay" - ? phi::RegularizationType::kL2DECAY - : phi::RegularizationType::kNONE; - float regularization_coeff = 0.0; - if (regularization_coeffs.size() != 0) { - regularization_coeff = regularization_coeffs[idx]; - } - - auto learning_rate = lrs.size() > 1 ? 
lrs[idx] : lrs[0]; - auto param = params[idx]; - auto param_out = params_out[idx]; - auto velocity = velocitys[idx]; - auto velocity_out = velocitys_out[idx]; - - auto grad = grads[idx]; - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(merged_momentum, - ops::NPUMergedMomentumOpKernel, - ops::NPUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc deleted file mode 100644 index a5349e05b9b02..0000000000000 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/momentum_op.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - std::string regularization_method = - ctx.Attr("regularization_method"); - auto regularization_coeff = ctx.Attr("regularization_coeff"); - phi::RegularizationType regularization_flag{ - phi::RegularizationType::kNONE}; // disable regularization - if (regularization_method == "l2_decay") { - regularization_flag = phi::RegularizationType::kL2DECAY; - } - - T mu = static_cast(ctx.Attr("mu")); - bool use_nesterov = ctx.Attr("use_nesterov"); - - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); - - auto* grad_var = ctx.InputVar("Grad"); - if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ( - false, - true, - platform::errors::PermissionDenied("Unsupport SparseMomentum")); - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in MomentumOp. Excepted LodTensor " - "or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(momentum, - ops::NPUMomentumOpKernel, - ops::NPUMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc deleted file mode 100644 index 6ee01272f47e8..0000000000000 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class RMSPROPNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *grad_var = ctx.InputVar("Grad"); - auto *param_out = ctx.Output("ParamOut"); - auto *moment_out = ctx.Output("MomentOut"); - auto *mean_square_out = ctx.Output("MeanSquareOut"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - auto rho = static_cast(ctx.Attr("decay")); - auto momentum = static_cast(ctx.Attr("momentum")); - auto *p_tensor = ctx.Input("Param"); - auto *ms_tensor = ctx.Input("MeanSquare"); - auto *lr_tensor = ctx.Input("LearningRate"); - auto *mom_tensor = ctx.Input("Moment"); - bool centered = ctx.Attr("centered"); - - auto stream = - ctx.template device_context() - .stream(); - if (grad_var->IsType()) { - auto *grad_tensor = ctx.Input("Grad"); - if (centered) { - framework::NPUAttributeMap attr_input = {{"use_locking", false}}; - const phi::DenseTensor *rho_tensor = nullptr; - const phi::DenseTensor *momentum_tensor = nullptr; - const phi::DenseTensor *epsilon_tensor = nullptr; - phi::DenseTensor rho_tmp(phi::DataType::FLOAT32); - rho_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&rho_tmp, rho); - rho_tensor = &rho_tmp; - phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32); - momentum_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&momentum_tmp, momentum); - momentum_tensor = &momentum_tmp; - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - auto *mg_tensor = ctx.Input("MeanGrad"); - auto *mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - const auto &runner_applycenterrmsprop = NpuOpRunner( - std::string("ApplyCenteredRMSPropD"), - {*p_tensor, - *mg_tensor, - *ms_tensor, - *mom_tensor, - *lr_tensor, - *rho_tensor, - *momentum_tensor, - *epsilon_tensor, - *grad_tensor}, - {*param_out, *mean_grad_out, *mean_square_out, *moment_out}, - {attr_input}); - runner_applycenterrmsprop.Run(stream); - } else { - framework::NPUAttributeMap attr_input = { - {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}}; - const auto &runner_applyrmsprop = NpuOpRunner( - std::string("ApplyRMSPropD"), - {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor}, - {*param_out, *mean_square_out, *moment_out}, - {attr_input}); - runner_applyrmsprop.Run(stream); - } - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in RmspropOp. 
Excepted LodTensor, " - "But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - rmsprop, ops::RMSPROPNPUKernel) diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc deleted file mode 100644 index 7bd5cf8793cd0..0000000000000 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/optimizers/sgd_op.h" - -namespace paddle { -namespace operators { - -template -class SGDNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* learning_rate = ctx.Input("LearningRate"); - auto* param_var = ctx.Input("Param"); - auto* grad_var = ctx.Input("Grad"); - auto* param_out = ctx.Output("ParamOut"); - - param_out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ApplyGradientDescent", - {*param_var, *learning_rate, *grad_var}, - {*param_out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so - // if param and param_out is not same, we need to do copy. - if (param_out->data() != param_var->data()) { - framework::TensorCopy( - *param_var, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - sgd, - ops::SGDNPUKernel, - ops::SGDNPUKernel, - ops::SGDNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc deleted file mode 100644 index 7ec3183d412d4..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class ReduceAnyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - bool keep_dim = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - out->mutable_data(ctx.GetPlace()); - - // set attr - NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; - - const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc deleted file mode 100644 index aec1640181bcc..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(reduce_any); -USE_OP_DEVICE_KERNEL(reduce_any, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - std::vector init_x = {true, false, false, false}; - f::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - std::vector axes; - f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}}; - auto op = f::OpRegistry::CreateOp( - "reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - - ctx.Wait(); - - std::vector out_vec; - f::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - std::vector expected_vec = {true}; - EXPECT_EQ(out_vec.size(), expected_vec.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_vec[i]); - } -} - -TEST(reduce_any, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc deleted file mode 100644 index de4049c7e7f97..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
deleted file mode 100644
index de4049c7e7f97..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+++ /dev/null
@@ -1,216 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMaxNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<platform::NPUDeviceContext>();
    if (framework::TransToProtoVarType(x->dtype()) ==
        framework::proto::VarType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      // ReduceMaxD has no int64 kernel; TypeAdapter runs it in int32.
      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};
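Before the gradient kernel, one note on the forward path's out_dtype handling: the reduction always runs in the input type into the cast_out temporary, and a single trailing Cast produces the requested dtype. A minimal host-side sketch of that shape of computation, simplified to a full reduction over one non-empty float vector with int64 standing in for cast_out_dtype (plain C++, not Paddle API):

#include <algorithm>
#include <cstdint>
#include <vector>

// Reduce in the input dtype, cast once at the end (assumes x is non-empty).
std::int64_t ReduceMaxThenCast(const std::vector<float>& x) {
  float m = x.front();                    // temporary in the input dtype
  for (float v : x) m = std::max(m, v);   // the ReduceMaxD step
  return static_cast<std::int64_t>(m);    // the trailing Cast step
}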
template <typename DeviceContext, typename T>
class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* out = context.Input<phi::DenseTensor>("Out");
    auto* out_grad =
        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    int in_dtype = context.Attr<int>("in_dtype");

    PADDLE_ENFORCE_EQ(
        in_dtype == -1,
        true,
        platform::errors::InvalidArgument(
            "NPU only supports in_dtype == -1 in reduce_max_grad op."));

    auto* x_grad =
        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(context.GetPlace());

    auto& dev_ctx =
        context.template device_context<platform::NPUDeviceContext>();
    auto place = context.GetPlace();
    auto stream = dev_ctx.stream();

    // broadcast
    auto x_dims_vec = phi::vectorize(x->dims());
    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < x_dims_vec.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }

    phi::DenseTensor tmp_out, tmp_out_grad;
    auto tmp_out_dims_vec = x_dims_vec;
    for (auto d : reduce_dims) {
      if (d < 0) {
        d += x_dims_vec.size();
      }
      tmp_out_dims_vec[d] = 1;
    }

    tmp_out.ShareDataWith(*out);
    tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
    tmp_out_grad.ShareDataWith(*out_grad);
    tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));

    phi::DenseTensor transformed_out(x->type());
    transformed_out.Resize(phi::make_ddim(x_dims_vec));
    transformed_out.mutable_data<T>(place);
    // BroadcastTo consumes its shape input, so hand each runner its own
    // copy and keep x_dims_vec intact for the second Resize below.
    auto brd_shape = x_dims_vec;
    NpuOpRunner r_brd_out;
    r_brd_out.SetType("BroadcastTo")
        .AddInput(tmp_out)
        .AddInput(std::move(brd_shape))
        .AddOutput(transformed_out)
        .Run(stream);
    phi::DenseTensor transformed_out_grad(x->type());
    transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
    transformed_out_grad.mutable_data<T>(place);
    auto brd_grad_shape = x_dims_vec;
    NpuOpRunner r_brd_out_grad;
    r_brd_out_grad.SetType("BroadcastTo")
        .AddInput(tmp_out_grad)
        .AddInput(std::move(brd_grad_shape))
        .AddOutput(transformed_out_grad)
        .Run(stream);

    // compare
    phi::DenseTensor equal_cond;
    equal_cond.mutable_data<bool>(x_grad->dims(), place);
    const auto& r_equal =
        NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {});
    r_equal.Run(stream);

    // select
    phi::DenseTensor t_zero;
    t_zero.mutable_data<T>(x_grad->dims(), place);
    FillNpuTensorWithConstant<T>(&t_zero, static_cast<T>(0));
    t_zero.Resize(x_grad->dims());

    const auto& r_sel = NpuOpRunner(
        "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {});
    r_sel.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_max,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
    reduce_max_grad,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int>);
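The gradient kernel above composes BroadcastTo, Equal and SelectV2 into the usual max-grad rule: upstream gradient flows to every position that ties the maximum, and everywhere else gets zero. A minimal host-side sketch of that rule over a flat vector (plain C++, not Paddle API):

#include <cstddef>
#include <vector>

// Gradient of max: dy goes to each position equal to the (broadcast) max.
std::vector<float> MaxGrad(const std::vector<float>& x, float out,
                           float out_grad) {
  std::vector<float> x_grad(x.size(), 0.0f);    // the t_zero tensor
  for (std::size_t i = 0; i < x.size(); ++i)
    if (x[i] == out) x_grad[i] = out_grad;      // Equal + SelectV2
  return x_grad;
}
// e.g. x = {1, 3, 3}, out = 3, out_grad = 1  ->  x_grad = {0, 1, 1}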
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
deleted file mode 100644
index 65fabbd21cb7e..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
+++ /dev/null
@@ -1,129 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class NPUReduceMeanOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");

    auto input_dims = input->dims();
    if (reduce_all) {
      dims.clear();
      for (int i = 0; i < input_dims.size(); i++) {
        dims.push_back(static_cast<int>(i));
      }
    }

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();

    NpuOpRunner runner;
    runner.SetType("ReduceMean")
        .AddInput(*input)
        .AddInput(std::move(dims))
        .AddOutput(*output)
        .AddAttrs({{"keep_dims", keep_dim}})
        .Run(stream);
  }
};

template <typename DeviceContext, typename T>
class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
    auto input_dims = input->dims();

    int reduce_numel = 1;
    if (reduce_all) {
      reduce_dims.clear();
      for (int d = 0; d < input_dims.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }
    for (auto& d : reduce_dims) {
      if (d < 0) {
        d = d + input_dims.size();
      }
      reduce_numel *= input_dims[d];
    }

    phi::DenseTensor tensor_value(input_grad->dtype());
    tensor_value.mutable_data<T>({1}, ctx.GetPlace());
    FillNpuTensorWithConstant<T>(
        &tensor_value,
        static_cast<T>(1.0f / static_cast<float>(reduce_numel)));

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
    NpuOpRunner runner;
    runner.SetType("Fill")
        .AddInput(phi::vectorize(input_dims))
        .AddInput(tensor_value)
        .AddOutput(*input_grad)
        .Run(stream);

    phi::DenseTensor transformed_input_grad, transformed_out_grad;
    phi::DenseTensor tmp_output_grad;
    auto tmp_output_dims = input_dims;
    for (auto d : reduce_dims) {
      tmp_output_dims[d] = 1;
    }
    tmp_output_grad.ShareDataWith(*output_grad);
    tmp_output_grad.Resize(tmp_output_dims);
    auto& dev_ctx =
        ctx.template device_context<platform::NPUDeviceContext>();
    NpuElementWiseOpBroadcast<T>(dev_ctx,
                                 input_grad,
                                 &tmp_output_grad,
                                 0,
                                 &transformed_input_grad,
                                 &transformed_out_grad);
    const auto& runner2 =
        NpuOpRunner("Mul",
                    {transformed_input_grad, transformed_out_grad},
                    {*input_grad},
                    {});
    runner2.Run(stream);
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    reduce_mean, ops::NPUReduceMeanOpKernel<plat::NPUDeviceContext, float>);
REGISTER_OP_NPU_KERNEL(
    reduce_mean_grad,
    ops::NPUReduceMeanGradOpKernel<plat::NPUDeviceContext, float>);
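The mean gradient above is built from just two device ops: Fill writes the constant 1/N at the input's full shape, and Mul scales it by the broadcast upstream gradient, where N is the number of elements folded into each output. A minimal host-side sketch of the rule, reduced to the all-axes case (plain C++, not Paddle API):

#include <cstddef>
#include <vector>

// Gradient of mean over all n elements: each input receives dy / n.
std::vector<float> MeanGradAll(std::size_t n, float out_grad) {
  float inv_n = 1.0f / static_cast<float>(n);      // the Fill value
  return std::vector<float>(n, out_grad * inv_n);  // the Mul step
}
// e.g. n = 4, out_grad = 1  ->  {0.25, 0.25, 0.25, 0.25}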
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
deleted file mode 100644
index e4adc42283120..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
+++ /dev/null
@@ -1,123 +0,0 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMinNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<platform::NPUDeviceContext>();
    if (x->dtype() == phi::DataType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      // ReduceMinD has no int64 kernel; TypeAdapter runs it in int32.
      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_min,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, plat::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int>);
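Both min and max take the same TypeAdapter detour for int64 inputs: the tensor is cast down to int32, reduced there, and the result widened back. A host-side sketch of that round trip (plain C++, not Paddle API); like the real adapter, it is only lossless under the assumption that the values fit in 32 bits:

#include <algorithm>
#include <cstdint>
#include <vector>

// int64 reduce-min emulated through int32 (assumes non-empty input and
// values representable in int32).
std::int64_t ReduceMinViaInt32(const std::vector<std::int64_t>& x) {
  std::int32_t m = static_cast<std::int32_t>(x.front());  // cast down
  for (auto v : x) m = std::min(m, static_cast<std::int32_t>(v));
  return static_cast<std::int64_t>(m);                    // widen back
}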
a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
deleted file mode 100644
index fd9bf28b60793..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
+++ /dev/null
@@ -1,102 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceProdNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();

    const auto& runner =
        NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input);
    runner.Run(stream);

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_prod,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, plat::float16>);
diff --git
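Every reduce kernel in this diff repeats the same reduce_all expansion: when the flag is set, the "axes" attribute is rebuilt as [0, 1, ..., rank-1] so every dimension is folded into the reduction. A one-function sketch of that helper (plain C++, not Paddle API):

#include <numeric>
#include <vector>

// Expand reduce_all into an explicit axes list covering every dimension.
std::vector<int> AllAxes(int rank) {
  std::vector<int> axes(rank);
  std::iota(axes.begin(), axes.end(), 0);  // fills 0, 1, ..., rank-1
  return axes;
}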
a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
deleted file mode 100644
index 0c6665494ece7..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
+++ /dev/null
@@ -1,171 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <memory>
#include <string>

#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/unsqueeze_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceSumNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    out->mutable_data<T>(ctx.GetPlace());

    // special case: reducing a 1-D tensor without keep_dim would drop to
    // a 0-D result, so force keep_dim instead
    if (x->dims().size() == 1 && keep_dims == false) {
      keep_dims = true;
    }

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();

    phi::DenseTensor cast_x;
    phi::DenseTensor cast_out;
    // NOTE: ReduceSumD only supports fp32 and fp16
    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      cast_x.Resize(x->dims());
      cast_x.mutable_data<float>(ctx.GetPlace());
      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
      const auto& runner_cast = NpuOpRunner(
          "Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);

      cast_out.Resize(out->dims());
      cast_out.mutable_data<float>(ctx.GetPlace());
    } else {
      cast_x.ShareDataWith(*x);
      cast_out.ShareDataWith(*out);
    }

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dim_vec}, {"keep_dims", keep_dims}});
      runner.Run(stream);

    } else {
      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dims}, {"keep_dims", keep_dims}});
      runner.Run(stream);
    }

    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      auto dst_dtype =
          ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype()));
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

template <typename DeviceContext, typename T>
class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    x_grad->mutable_data<T>(ctx.GetPlace());

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
    if (keep_dims || reduce_all) {
      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {*out_grad},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    } else {
      auto out_dims = UnsqueezeKernel<DeviceContext, T>::GetOutputShape(
          dims, out_grad->dims());

      phi::DenseTensor out_grad_tmp(out_grad->type());
      out_grad_tmp.Resize(out_dims);
      out_grad_tmp.mutable_data<T>(ctx.GetPlace());
      framework::TensorCopy(
          *out_grad,
          ctx.GetPlace(),
          ctx.template device_context<platform::NPUDeviceContext>(),
          &out_grad_tmp);
      out_grad_tmp.Resize(out_dims);

      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {out_grad_tmp},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    }
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    reduce_sum,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
    reduce_sum_grad,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int>);
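The sum gradient above needs no arithmetic at all: since d(sum)/dx_i = 1 for every i, BroadcastToD simply copies the upstream gradient to every input position, with an unsqueeze first when keep_dim is false so the shapes line up (that is what GetOutputShape computes). A minimal host-side sketch of the all-axes case (plain C++, not Paddle API):

#include <cstddef>
#include <vector>

// Gradient of sum over all n elements: each input receives dy unchanged.
std::vector<float> SumGradAll(std::size_t n, float out_grad) {
  return std::vector<float>(n, out_grad);  // the BroadcastToD step
}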
NpuOpRunner("BroadcastToD", - {*out_grad}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } else { - framework::DDim out_dims; - out_dims = UnsqueezeKernel::GetOutputShape( - dims, out_grad->dims()); - - phi::DenseTensor out_grad_tmp(out_grad->type()); - out_grad_tmp.Resize(out_dims); - out_grad_tmp.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *out_grad, - ctx.GetPlace(), - ctx.template device_context(), - &out_grad_tmp); - out_grad_tmp.Resize(out_dims); - - const auto& runner = NpuOpRunner("BroadcastToD", - {out_grad_tmp}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reduce_sum, - ops::ReduceSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumNPUKernel, -#endif - ops::ReduceSumNPUKernel, - ops::ReduceSumNPUKernel); -REGISTER_OP_NPU_KERNEL( - reduce_sum_grad, - ops::ReduceSumGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumGradNPUKernel, -#endif - ops::ReduceSumGradNPUKernel, - ops::ReduceSumGradNPUKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc deleted file mode 100644 index 3978923d46af7..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" - -namespace paddle { -namespace operators { - -template -class SequenceMaskNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - int maxlen = ctx.Attr("maxlen"); - - if (ctx.HasInput("MaxLenTensor")) { - auto max_len_tensor = ctx.Input("MaxLenTensor"); - PADDLE_ENFORCE_NOT_NULL(max_len_tensor, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) should not be NULL." - "But received Input(MaxLenTensor) is NULL")); - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *max_len_tensor, platform::CPUPlace(), &temp); - maxlen = *temp.data(); - PADDLE_ENFORCE_GT( - maxlen, - 0, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) value should be greater than 0. 
But " - "received Input(MaxLenTensor) value = %d.", - maxlen)); - } - - if (maxlen < 0) { - auto x_numel = x->numel(); - if (x_numel == 0) { - maxlen = 0; - } else { - std::vector x_vec; - framework::TensorToVector(*x, dev_ctx, &x_vec); - auto x_data = x_vec.data(); - maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); - } - } - auto y_dim = phi::vectorize(x->dims()); - y_dim.push_back(maxlen); - - phi::DenseTensor cast_x; - cast_x.mutable_data(x->dims(), ctx.GetPlace()); - const auto& cast1_runner = NpuOpRunner( - "Cast", - {*x}, - {cast_x}, - {{"dst_type", - ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}}); - cast1_runner.Run(dev_ctx.stream()); - - phi::DenseTensor tmp; - tmp.mutable_data(phi::make_ddim({maxlen}), ctx.GetPlace()); - NpuOpRunner range_runner; - range_runner.SetType("Range"); - range_runner.AddInput(std::vector({0})); - range_runner.AddInput(std::vector({maxlen})); - range_runner.AddInput(std::vector({1})); - range_runner.AddOutput(tmp); - range_runner.Run(dev_ctx.stream()); - - phi::DenseTensor expand_tmp; - expand_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& expand_runner = - NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}}); - expand_runner.Run(dev_ctx.stream()); - - auto x_dims = phi::vectorize(x->dims()); - x_dims.push_back(1); - cast_x.Resize(phi::make_ddim({x_dims})); - phi::DenseTensor x_tmp; - x_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& tile_runner = - NpuOpRunner("TileWithAxis", - {cast_x}, - {x_tmp}, - {{"axis", x->dims().size()}, {"tiles", maxlen}}); - tile_runner.Run(dev_ctx.stream()); - - phi::DenseTensor y_tmp; - y_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& less_runner = - NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {}); - less_runner.Run(dev_ctx.stream()); - - y->Resize(phi::make_ddim(y_dim)); - auto out_dtype = static_cast( - ctx.Attr("out_dtype")); - if (out_dtype == framework::proto::VarType::INT32) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::INT64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP32) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::BOOL) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::UINT8) { - y->mutable_data(ctx.GetPlace()); - } else { - PADDLE_ENFORCE(false, - platform::errors::InvalidArgument( - "out_dtype only supporing int32, int64, fp32, fp64, " - "bool, uint8, but receive out_dtype is %d", - out_dtype)); - } - - const auto& cast2_runner = NpuOpRunner( - "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}}); - cast2_runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - sequence_mask, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel);