diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc deleted file mode 100644 index 937cd46d52888..0000000000000 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ /dev/null @@ -1,588 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/phi/common/data_type.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class NPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class BeamSearchFunctor { - public: - void operator()(const platform::NPUDeviceContext& ctx, - const phi::DenseTensor* pre_ids, - const phi::DenseTensor* pre_scores, - const phi::DenseTensor* ids, - const phi::DenseTensor* scores, - phi::DenseTensor* selected_ids, - phi::DenseTensor* selected_scores, - phi::DenseTensor* parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated) { - auto abs_lod = framework::ToAbsOffset(scores->lod()); - auto& high_level = abs_lod[level]; - - int64_t num_seqs = scores->NumElements(level); - // size of the first beam is 1, others are equal to beam_size - int64_t real_beam_size = static_cast(scores->dims()[0] / num_seqs); - // K - int64_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { - seq_width *= scores->dims()[i]; - } - - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - - int64_t total_length = num_seqs * beam_size; - int64_t batch_size = static_cast(scores->dims()[0]); - selected_ids->mutable_data(phi::make_ddim({total_length, 1}), - place); - selected_scores->mutable_data(phi::make_ddim({total_length, 1}), - place); - parent_idx->mutable_data(phi::make_ddim({total_length}), place); - - // Step1: Define Tensors and Preprocess the situation that pre_id == end_id - - // cast ids and pre_ids from int to float32 - Tensor ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(ids->dtype()) != - framework::proto::VarType::INT32) { - ids_int32.Resize(ids->dims()); - ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_ids_int32 = - ConvertToNpuDtype(framework::TransToProtoVarType(ids_int32.dtype())); - const auto& runner_ids_int32 = - NpuOpRunner("Cast", - {*ids}, - {ids_int32}, - {{"dst_type", static_cast(dst_dtype_ids_int32)}}); - runner_ids_int32.Run(stream); - } else { - ids_int32.ShareDataWith(*ids); - } - - Tensor pre_ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(pre_ids->dtype()) != - framework::proto::VarType::INT32) { - pre_ids_int32.Resize(pre_ids->dims()); - pre_ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_pre_ids_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(pre_ids_int32.dtype())); - const auto& runner_pre_ids_int32 = NpuOpRunner( - "Cast", - {*pre_ids}, - {pre_ids_int32}, - {{"dst_type", 
static_cast(dst_dtype_pre_ids_int32)}}); - runner_pre_ids_int32.Run(stream); - } else { - pre_ids_int32.ShareDataWith(*pre_ids); - } - - Tensor expand_pre_ids(pre_ids_int32.dtype()); - expand_pre_ids.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_ids.mutable_data(place); - const auto& runner_tile_pre_ids = - NpuOpRunner("TileWithAxis", - {pre_ids_int32}, - {expand_pre_ids}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_ids.Run(stream); - expand_pre_ids.Resize(ids_int32.dims()); - - Tensor expand_pre_scores(pre_scores->dtype()); - expand_pre_scores.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_scores.mutable_data(place); - const auto& runner_tile_pre_scores = - NpuOpRunner("TileWithAxis", - {*pre_scores}, - {expand_pre_scores}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_scores.Run(stream); - expand_pre_scores.Resize(scores->dims()); - - // End_id Tensors - Tensor end_id_tmp_tensor(phi::DataType::INT32); - end_id_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&end_id_tmp_tensor, end_id); - - Tensor end_id_tensors(ids_int32.dtype()); - end_id_tensors.mutable_data(ids_int32.dims(), place); - const auto& runner_fill_end_id = - NpuOpRunner("FillD", - {end_id_tmp_tensor}, - {end_id_tensors}, - {{"dims", phi::vectorize(ids_int32.dims())}}); - runner_fill_end_id.Run(stream); - - // whether expand_pre_ids == end_ids? - Tensor equal_end_ids(phi::DataType::BOOL); - equal_end_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_equal_end_ids = NpuOpRunner( - "Equal", {expand_pre_ids, end_id_tensors}, {equal_end_ids}, {}); - runner_equal_end_ids.Run(stream); - - // construct a Tensor with dimension ids->dims(): - // [[False, True, True, True, ...], - // [False, True, True, True, ...], - // ...] 
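// What follows builds, as far as can be inferred from the surrounding
// code, a per-row mask whose first column is false and whose remaining
// seq_width - 1 columns are true. ANDed with equal_end_ids, the mask
// marks, for every beam that has already emitted end_id, all candidate
// slots except the first; those slots are later overwritten with -inf,
// so a finished beam contributes exactly one (its accumulated) score to
// the top-k selection.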
- Tensor false_tmp_tensor(phi::DataType::INT32); - false_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&false_tmp_tensor, static_cast(false)); - - Tensor first_pos_false_tensors(phi::DataType::INT32); - first_pos_false_tensors.Resize(phi::make_ddim({batch_size, 1})); - first_pos_false_tensors.mutable_data(place); - std::vector fill_dims = {batch_size, 1}; - framework::NPUAttributeMap fill_attr = {{"dims", fill_dims}}; - const auto& runner_fill_false_tensors = NpuOpRunner( - "FillD", {false_tmp_tensor}, {first_pos_false_tensors}, fill_attr); - runner_fill_false_tensors.Run(stream); - - Tensor pos_tensors(phi::DataType::INT32); - if (seq_width > 1) { - pos_tensors.Resize(phi::make_ddim({batch_size, seq_width})); - pos_tensors.mutable_data(place); - - Tensor true_tmp_tensor(phi::DataType::INT32); - true_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&true_tmp_tensor, static_cast(true)); - - Tensor second_pos_true_tensors(phi::DataType::INT32); - second_pos_true_tensors.Resize( - phi::make_ddim({batch_size, seq_width - 1})); - second_pos_true_tensors.mutable_data(place); - std::vector fill_dims2 = {batch_size, seq_width - 1}; - framework::NPUAttributeMap fill_attr2 = {{"dims", fill_dims2}}; - const auto& runner_fill_true_tensors = NpuOpRunner( - "FillD", {true_tmp_tensor}, {second_pos_true_tensors}, fill_attr2); - runner_fill_true_tensors.Run(stream); - - std::vector concat_inputs = {first_pos_false_tensors, - second_pos_true_tensors}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_false_true{"ConcatD", - {concat_inputs}, - {pos_tensors}, - {{"concat_dim", 1}, {"N", 2}}}; - runner_concat_false_true.AddInputNames(concat_names); - runner_concat_false_true.Run(stream); - pos_tensors.Resize(ids_int32.dims()); - } else { - pos_tensors.ShareDataWith(first_pos_false_tensors); - } - - Tensor cast_pos_tensors_bool(phi::DataType::BOOL); - cast_pos_tensors_bool.Resize(pos_tensors.dims()); - cast_pos_tensors_bool.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_pos_tensors_bool.type())); - const auto& runner_cast_pos_tensors = - NpuOpRunner("Cast", - {pos_tensors}, - {cast_pos_tensors_bool}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_pos_tensors.Run(stream); - - // if pre_ids == end_ids, save only one score, and others become -inf - // construct pre_ids == end_ids and save only one score - Tensor save_one_end_score(phi::DataType::BOOL); - save_one_end_score.mutable_data(ids_int32.dims(), place); - const auto& runner_logical_and = - NpuOpRunner("LogicalAnd", - {equal_end_ids, cast_pos_tensors_bool}, - {save_one_end_score}, - {}); - runner_logical_and.Run(stream); - - // if save_one_end_score is True, set score to -inf - // define -Inf Tensors - Tensor ninf_tmp_tensor(scores->dtype()); - ninf_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - float ninf_value = - static_cast(-std::numeric_limits::infinity()); - FillNpuTensorWithConstant(&ninf_tmp_tensor, ninf_value); - - Tensor ninf_tensors(scores->dtype()); - ninf_tensors.mutable_data(scores->dims(), place); - const auto& runner_fill_ninf = - NpuOpRunner("FillD", - {ninf_tmp_tensor}, - {ninf_tensors}, - {{"dims", phi::vectorize(scores->dims())}}); - runner_fill_ninf.Run(stream); - - // Step2: calculate topk scores - - // get scores used in topk op - Tensor tmp_scores(scores->dtype()); - tmp_scores.mutable_data(scores->dims(), place); - if (!is_accumulated) { - // if pre_id == end_id, cal_scores = pre_score, and 
id = end_id - // else, cal_score = pre_score + log(score) - - // calculate log(scores) - Tensor log_scores(scores->dtype()); - log_scores.mutable_data(scores->dims(), place); - - Tensor one(scores->dtype()); - one.mutable_data(scores->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*scores}, {one}, {}); - runner_one.Run(stream); - - Tensor sub(scores->dtype()); - sub.mutable_data(scores->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*scores, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_log_scores = - NpuOpRunner("Log1p", {sub}, {log_scores}, {}); - runner_log_scores.Run(stream); - - // tmp_scores = pre_score + log(scores) - const auto& runner_add_scores = - NpuOpRunner("Add", {log_scores, *pre_scores}, {tmp_scores}, {}); - runner_add_scores.Run(stream); - - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, tmp_scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score.Run(stream); - } else { - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score2 = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, *scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score2.Run(stream); - } - - // if pre_ids == end_ids, save only one score, and others become -inf - Tensor cal_scores(scores->dtype()); - cal_scores.mutable_data(scores->dims(), place); - const auto& runner_select_inf_score = - NpuOpRunner("Select", - {save_one_end_score, ninf_tensors, tmp_scores}, - {cal_scores}, - {}); - runner_select_inf_score.Run(stream); - - // resize scores from [num_seqs * beam_size, K] to [num_seqs, beam_size * K] - // real_beam_size = 1 or beam_size - cal_scores.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - Tensor topk_scores(scores->dtype()); - topk_scores.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - topk_scores.mutable_data(ctx.GetPlace()); - - Tensor tmp_indices(phi::DataType::INT32); - tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - tmp_indices.mutable_data(ctx.GetPlace()); - - // run topk op - NpuOpRunner runner_topk; - runner_topk.SetType("TopKV2") - .AddInput(cal_scores) - .AddInput(std::vector{static_cast(beam_size)}) - .AddOutput(topk_scores) - .AddOutput(tmp_indices) - .AddAttr("sorted", true) - .AddAttr("dim", -1) - .AddAttr("largest", true); - runner_topk.Run(stream); - - // cast tmp_indices from int to float32 for Sort op - Tensor cast_tmp_indices(phi::DataType::FLOAT32); - cast_tmp_indices.Resize(tmp_indices.dims()); - cast_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_fp32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_tmp_indices.type())); - const auto& runner_cast_tmp_indices = NpuOpRunner( - "Cast", - {tmp_indices}, - {cast_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_fp32)}}); - runner_cast_tmp_indices.Run(stream); - - // sort tmp_indices - Tensor sorted_tmp_indices(phi::DataType::FLOAT32); - sorted_tmp_indices.Resize(tmp_indices.dims()); - sorted_tmp_indices.mutable_data(ctx.GetPlace()); - Tensor sorted_score_indices(phi::DataType::INT32); - sorted_score_indices.Resize(tmp_indices.dims()); - sorted_score_indices.mutable_data(ctx.GetPlace()); - const auto& runner_sort_tmp_indices = - NpuOpRunner("Sort", - {cast_tmp_indices}, - {sorted_tmp_indices, sorted_score_indices}, - {{"axis", 1}, {"descending", false}}); - 
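// Rationale (inferred): TopKV2 returns indices ordered by score, but the
// beam-search outputs must stay grouped by the original candidate order
// so that parent_idx and the output LoD remain monotone. Sorting the
// top-k indices ascending restores that order, and the second Sort
// output (the argsort permutation) is reused below by GatherNd to
// reorder the top-k scores consistently.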
runner_sort_tmp_indices.Run(stream); - - // cast sorted_tmp_indices from float32 to int - Tensor cast_sort_tmp_indices(phi::DataType::INT32); - cast_sort_tmp_indices.Resize(sorted_tmp_indices.dims()); - cast_sort_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_sort_tmp_indices.type())); - const auto& runner_cast_sort_tmp_indices = NpuOpRunner( - "Cast", - {sorted_tmp_indices}, - {cast_sort_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_int32)}}); - runner_cast_sort_tmp_indices.Run(stream); - - // Step 3: infer selected ids from tmp_indices and ids - - // if pre_ids == end_ids, use pre_ids rather than ids - Tensor cal_ids(ids_int32.dtype()); - cal_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_select_equal_end_id = NpuOpRunner( - "Select", {equal_end_ids, expand_pre_ids, ids_int32}, {cal_ids}, {}); - runner_select_equal_end_id.Run(stream); - - // resize ids from [num_seqs * real_beam_size, K] to [num_seqs, - // real_beam_size * K] - // real_beam_size = 1 or beam_size - cal_ids.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - // construct batch_ids like [[0, 0, 0], [1, 1, 1], ..., [bs-1, bs-1, bs-1]] - // construct arange(num_seqs*beam_size).reshape((num_seqs, beam_size)) // - // beam_size - Tensor batch_ids(phi::DataType::INT32); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - batch_ids.mutable_data(place); - - std::vector vector_batch_ids; - for (int i = 0; i < num_seqs * static_cast(beam_size); ++i) { - vector_batch_ids.push_back(static_cast(i / beam_size)); - } - framework::TensorFromVector(vector_batch_ids, ctx, &batch_ids); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - - // sort topk_scores to get selected_scores - // get indices of gather_nd op for calculating selected_scores - Tensor gather_nd_score_indices(phi::DataType::INT32); - gather_nd_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_score_indices.mutable_data(place); - - sorted_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - std::vector concat_inputs2 = {batch_ids, - sorted_score_indices}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_score_indices{"ConcatD", - {concat_inputs2}, - {gather_nd_score_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_score_indices.AddInputNames(concat_names); - runner_concat_score_indices.Run(stream); - - // use gather_nd to get selected_scores - const auto& runner_gather_nd_scores = - NpuOpRunner("GatherNd", - {topk_scores, gather_nd_score_indices}, - {*selected_scores}, - {}); - runner_gather_nd_scores.Run(stream); - - // get indices of gather_nd op - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - Tensor gather_nd_id_indices(phi::DataType::INT32); - gather_nd_id_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_id_indices.mutable_data(place); - - std::vector concat_inputs3 = {batch_ids, - cast_sort_tmp_indices}; - NpuOpRunner runner_concat_id_indices{"ConcatD", - {concat_inputs3}, - {gather_nd_id_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_id_indices.AddInputNames(concat_names); - runner_concat_id_indices.Run(stream); - - // use gather_nd to get selected_ids - Tensor topk_ids(phi::DataType::INT32); - topk_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - 
topk_ids.mutable_data(ctx.GetPlace()); - - const auto& runner_gather_nd_ids = NpuOpRunner( - "GatherNd", {cal_ids, gather_nd_id_indices}, {topk_ids}, {}); - runner_gather_nd_ids.Run(stream); - - // cast topk_ids from int to int64 to get selected_ids - auto dst_dtype_selected_ids = - ConvertToNpuDtype(framework::TransToProtoVarType(selected_ids->type())); - const auto& runner_cast_selected_ids = - NpuOpRunner("Cast", - {topk_ids}, - {*selected_ids}, - {{"dst_type", static_cast(dst_dtype_selected_ids)}}); - runner_cast_selected_ids.Run(stream); - - // TODO(pangyoki): PruneEndBeams - - // Step 4: set lod of output Tensor - // define Tensor with value `seq_width` - Tensor seq_width_tensor(phi::DataType::INT32); - seq_width_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&seq_width_tensor, - static_cast(seq_width)); - - // beam_ids = tmp_indices // seq_width - Tensor beam_ids(phi::DataType::INT32); - beam_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - beam_ids.mutable_data(ctx.GetPlace()); - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - const auto& runner_div = NpuOpRunner( - "Div", {cast_sort_tmp_indices, seq_width_tensor}, {beam_ids}, {}); - runner_div.Run(stream); - - // get parent_idx by adding batch_ids to beam_ids - // construct scale_batch_ids like [[0, 0, 0], [bw, bw, bw], ..., [bs-1*bw, - // bs-1*bw, bs-1*bw]] - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - // cast batch_ids from int to float32 - Tensor cast_batch_ids(phi::DataType::FLOAT32); - cast_batch_ids.Resize(batch_ids.dims()); - cast_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype1 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_batch_ids.type())); - const auto& runner_cast_batch_ids = - NpuOpRunner("Cast", - {batch_ids}, - {cast_batch_ids}, - {{"dst_type", static_cast(dst_dtype1)}}); - runner_cast_batch_ids.Run(stream); - - // scale batch_ids with beam_size - Tensor scale_batch_ids(phi::DataType::FLOAT32); - scale_batch_ids.Resize(batch_ids.dims()); - scale_batch_ids.mutable_data(place); - const auto& runner_power = - NpuOpRunner("Power", - {cast_batch_ids}, - {scale_batch_ids}, - {{"power", static_cast(1.0)}, - {"scale", static_cast(beam_size)}, - {"shift", static_cast(0.0)}}); - runner_power.Run(stream); - - // cast cast_scale_batch_ids from float32 to int - Tensor cast_scale_batch_ids(phi::DataType::INT32); - cast_scale_batch_ids.Resize(scale_batch_ids.dims()); - cast_scale_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype2 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_scale_batch_ids.type())); - const auto& runner_cast_scale_batch_ids = - NpuOpRunner("Cast", - {scale_batch_ids}, - {cast_scale_batch_ids}, - {{"dst_type", static_cast(dst_dtype2)}}); - runner_cast_scale_batch_ids.Run(stream); - - // calculate parent_idx - Tensor tmp_parent_idx(phi::DataType::INT32); - tmp_parent_idx.Resize(parent_idx->dims()); - tmp_parent_idx.mutable_data(place); - const auto& runner_add_beam_id = NpuOpRunner( - "Add", {beam_ids, cast_scale_batch_ids}, {tmp_parent_idx}, {}); - runner_add_beam_id.Run(stream); - - // cast tmp_parent_idx from int to int64 to get parent_idx - auto dst_dtype_parent_idx = - ConvertToNpuDtype(framework::TransToProtoVarType(parent_idx->type())); - const auto& runner_cast_parent_idx = - NpuOpRunner("Cast", - {tmp_parent_idx}, - {*parent_idx}, - {{"dst_type", static_cast(dst_dtype_parent_idx)}}); - runner_cast_parent_idx.Run(stream); - - std::vector 
vector_parent_idx; - framework::TensorToVector(tmp_parent_idx, ctx, &vector_parent_idx); - - // set low level, len(low_level) = high_level[-1] - std::vector low_level; - std::vector num_parent_ids(num_seqs * beam_size, - static_cast(0)); - size_t low_level_size = high_level[num_seqs]; - size_t sum_parent_id = 0; - - // calculate number of every parent_id - for (size_t i = 0; i < num_seqs * beam_size; ++i) { - num_parent_ids[vector_parent_idx[i]]++; - } - - // update low_level - low_level.push_back(0); - for (size_t i = 0; i < low_level_size; ++i) { - sum_parent_id += num_parent_ids[i]; - low_level.push_back(sum_parent_id); - } - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(lod))); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); - } -}; - -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc deleted file mode 100644 index 094f39366ab35..0000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class AccuracyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* label = ctx.Input("Label"); - auto* indices = ctx.Input("Indices"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - auto stream = - ctx.template device_context() - .stream(); - - int num_samples = inference->dims()[0]; - if (num_samples == 0) { - return; - } - - // cast `indices` or `label` if their type is not consistent - Tensor cast_indices(phi::DataType::INT32); - Tensor cast_label(phi::DataType::INT32); - if (indices->dtype() != label->dtype()) { - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - if (framework::TransToProtoVarType(indices->dtype()) != - framework::proto::VarType::INT32) { - cast_indices.Resize(indices->dims()); - cast_indices.mutable_data(ctx.GetPlace()); - const auto& runner_cast_indices = - NpuOpRunner("Cast", - {*indices}, - {cast_indices}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_indices.Run(stream); - } else { - cast_indices.ShareDataWith(*indices); - } - if (framework::TransToProtoVarType(label->dtype()) != - framework::proto::VarType::INT32) { - cast_label.Resize(label->dims()); - cast_label.mutable_data(ctx.GetPlace()); - const auto& runner_cast_label = - NpuOpRunner("Cast", - {*label}, - {cast_label}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_label.Run(stream); - } else { - cast_label.ShareDataWith(*label); - } - } else { - cast_indices.ShareDataWith(*indices); - cast_label.ShareDataWith(*label); - } - - // equal - Tensor tmp_equal(phi::DataType::BOOL); - tmp_equal.Resize(inference->dims()); - tmp_equal.mutable_data(ctx.GetPlace()); - const auto& runner_equal = - NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); - runner_equal.Run(stream); - - // cast equal - Tensor tmp_equal_cast(phi::DataType::FLOAT32); - tmp_equal_cast.Resize(inference->dims()); - tmp_equal_cast.mutable_data(ctx.GetPlace()); - const auto& runner_cast_equal = NpuOpRunner( - "Cast", - {tmp_equal}, - {tmp_equal_cast}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(tmp_equal_cast.dtype())))}}); - runner_cast_equal.Run(stream); - - // [correct] - // reduce_max - Tensor tmp_correct_max(phi::DataType::FLOAT32); - tmp_correct_max.Resize(phi::make_ddim({num_samples})); - tmp_correct_max.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_max = - NpuOpRunner("ReduceMaxD", - {tmp_equal_cast}, - {tmp_correct_max}, - {{"axes", std::vector{1}}, {"keep_dims", false}}); - runner_reduce_max.Run(stream); - - // reduce_sum - Tensor tmp_correct(phi::DataType::FLOAT32); - tmp_correct.Resize(correct->dims()); - tmp_correct.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {tmp_correct_max}, - {tmp_correct}, - {{"axes", std::vector{0}}, {"keep_dims", false}}); - runner_reduce_sum.Run(stream); - - // cast to int - correct->mutable_data(ctx.GetPlace()); - const auto& runner_cast_correct = - NpuOpRunner("Cast", - {tmp_correct}, - {*correct}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(correct->dtype())))}}); - runner_cast_correct.Run(stream); - - // [total] - total->mutable_data(ctx.GetPlace()); 
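// Recap (inferred from the ops above): `correct` is ReduceMax over the
// per-sample equality mask (1.0 if any top-k index hits the label) then
// ReduceSum over the batch. `total` is simply the sample count; a
// float32 copy of it is filled below so that accuracy = correct / total
// can be computed with the floating-point Div runner.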
- FillNpuTensorWithConstant(total, static_cast(num_samples)); - - // use `total` of type `float32` for calculating accuracy - Tensor tmp_total(phi::DataType::FLOAT32); - tmp_total.Resize(total->dims()); - tmp_total.mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(&tmp_total, - static_cast(num_samples)); - - // [accuracy] - accuracy->mutable_data(ctx.GetPlace()); - const auto& runner_accuracy = - NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); - runner_accuracy.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - accuracy, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc deleted file mode 100644 index 3324e56b3b95f..0000000000000 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ /dev/null @@ -1,345 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class AdamNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Grad(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(param_var->Type()))); - auto* grad = ctx.Input("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector( - *skip_update_tensor, ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - 
// mutable_data - if (skip_update) { - VLOG(4) << "Adam skip update"; - framework::TensorCopy( - *param, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - framework::TensorCopy( - *mom1, - ctx.GetPlace(), - ctx.template device_context(), - mom1_out); - framework::TensorCopy( - *mom2, - ctx.GetPlace(), - ctx.template device_context(), - mom2_out); - framework::TensorCopy( - *beta1_pow, - beta1_pow->place(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, - beta2_pow->place(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - param_out->mutable_data(ctx.GetPlace()); - mom1_out->mutable_data(ctx.GetPlace()); - mom2_out->mutable_data(ctx.GetPlace()); - - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform - // place. - phi::DenseTensor beta1_pow_tmp; - phi::DenseTensor beta2_pow_tmp; - if (beta1_pow->place() == platform::CPUPlace()) { - T beta1 = *beta1_pow->data(); - beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); - beta1_pow = &beta1_pow_tmp; - } - if (beta2_pow->place() == platform::CPUPlace()) { - T beta2 = *beta2_pow->data(); - beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); - beta2_pow = &beta2_pow_tmp; - } - - const phi::DenseTensor* beta1_tensor = nullptr; - const phi::DenseTensor* beta2_tensor = nullptr; - const phi::DenseTensor* epsilon_tensor = nullptr; - - phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32); - phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32); - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - - if (ctx.HasInput("Beta1Tensor")) { - beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - } else { - T beta1 = static_cast(ctx.Attr("beta1")); - beta1_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_tmp, beta1); - beta1_tensor = &beta1_tmp; - } - - if (ctx.HasInput("Beta2Tensor")) { - beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - } else { - T beta2 = static_cast(ctx.Attr("beta2")); - beta2_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_tmp, beta2); - beta2_tensor = &beta2_tmp; - } - - if (ctx.HasInput("EpsilonTensor")) { - epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - } else { - T epsilon = static_cast(ctx.Attr("epsilon")); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - } - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 
1, but received "
-                          "value is:%d.",
-                          beta2_pow_out->numel()));
-
-    auto stream =
-        ctx.template device_context<platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner("ApplyAdamD",
-                                     {
-                                         *param,
-                                         *mom1,
-                                         *mom2,
-                                         *beta1_pow,
-                                         *beta2_pow,
-                                         *lr,
-                                         *beta1_tensor,
-                                         *beta2_tensor,
-                                         *epsilon_tensor,
-                                         *grad,
-                                     },
-                                     {
-                                         *param_out,
-                                         *mom1_out,
-                                         *mom2_out,
-                                     },
-                                     {});
-    runner.Run(stream);
-
-    // NOTE(zhiqiu): ApplyAdamD updates params in place, so if param and
-    // param_out are not the same, we need to copy.
-    if (param_out->data<T>() != param->data<T>()) {
-      framework::TensorCopy(
-          *param,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          param_out);
-    }
-    if (mom1_out->data<T>() != mom1->data<T>()) {
-      framework::TensorCopy(
-          *mom1,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          mom1_out);
-    }
-    if (mom2_out->data<T>() != mom2->data<T>()) {
-      framework::TensorCopy(
-          *mom2,
-          ctx.GetPlace(),
-          ctx.template device_context<platform::DeviceContext>(),
-          mom2_out);
-    }
-    if (!use_global_beta_pow) {
-      beta1_pow_out->mutable_data<T>(ctx.GetPlace());
-      beta2_pow_out->mutable_data<T>(ctx.GetPlace());
-      const auto& runner_m1 =
-          NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
-      runner_m1.Run(stream);
-      const auto& runner_m2 =
-          NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
-      runner_m2.Run(stream);
-    }
-  }
-};
-
-template <typename T>
-class AdamWNPUKernel : public AdamNPUKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    VLOG(3) << "NPU AdamW Kernel";
-    bool skip_update = false;
-    if (ctx.HasInput("SkipUpdate")) {
-      VLOG(3) << "Has SkipUpdate";
-      auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
-      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
-                        1,
-                        platform::errors::InvalidArgument(
-                            "Input(SkipUpdate) size must be 1, but got %d",
-                            skip_update_tensor->numel()));
-      std::vector<bool> skip_update_vec;
-      paddle::framework::TensorToVector(
-          *skip_update_tensor, ctx.device_context(), &skip_update_vec);
-      skip_update = skip_update_vec[0];
-    }
-    VLOG(3) << "Skip update: " << skip_update;
-    bool with_decay = ctx.Attr<bool>("with_decay");
-    if (!skip_update && with_decay) {
-      float coeff = ctx.Attr<float>("coeff");
-      auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
-
-      auto place = ctx.GetPlace();
-
-      auto stream =
-          ctx.template device_context<platform::NPUDeviceContext>()
-              .stream();
-
-      phi::DenseTensor one(phi::DataType::FLOAT32);
-      phi::DenseTensor decay(phi::DataType::FLOAT32);
-      phi::DenseTensor tmp(phi::DataType::FLOAT32);
-
-      tmp.mutable_data<float>({1}, place);
-      one.mutable_data<float>({1}, place);
-      decay.mutable_data<float>({1}, place);
-
-      FillNpuTensorWithConstant<float>(&one, 1.0f);
-      framework::NPUAttributeMap attr_input = {{"value", coeff}};
-
-      const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input);
-      runner1.Run(stream);
-
-      const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {});
-      runner2.Run(stream);
-
-      if (ctx.HasInput("MasterParam")) {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Master Param is not supported on NPU"));
-      } else {
-        auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
-        param_out->mutable_data<T>(ctx.GetPlace());
-
-        const auto* param_var = ctx.InputVar("Param");
-        PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
-                          true,
-                          platform::errors::InvalidArgument(
-                              "The Var(%s)'s type should be phi::DenseTensor, "
-                              "but the received is %s",
-                              ctx.InputNames("Param").front(),
-                              framework::ToTypeName(param_var->Type())));
-        auto* param = ctx.Input<phi::DenseTensor>("Param");
-
-        const auto& runner =
-            NpuOpRunner("Mul",
-                        {*param, decay},
-                        {*const_cast<phi::DenseTensor*>(param)},
-                        {});
-        runner.Run(stream);
-      }
-    }
-    AdamNPUKernel<T>::Compute(ctx);
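// Note (inferred): the with_decay branch above implements decoupled
// weight decay, scaling the parameter by decay = 1 - lr * coeff before
// delegating to the plain Adam update; this is the defining difference
// between AdamW and Adam with L2 regularization.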
} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - adam, - ops::AdamNPUKernel, - ops::AdamNPUKernel); - -REGISTER_OP_NPU_KERNEL(adamw, - ops::AdamWNPUKernel, - ops::AdamWNPUKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc deleted file mode 100644 index 83c805a1f642a..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMergedMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - size_t n = params.size(); - PADDLE_ENFORCE_EQ(n, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(params[i], - params_out[i], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - } - - auto grads = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ( - n, - grads.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grads.size(), - n)); - - auto velocitys = ctx.MultiInput("Velocity"); - PADDLE_ENFORCE_EQ(n, - velocitys.size(), - platform::errors::InvalidArgument( - "The size of Input(Velocity) must be equal to " - "Input(Param), but got the size of Input(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocitys.size(), - n)); - - auto velocitys_out = ctx.MultiOutput("VelocityOut"); - PADDLE_ENFORCE_EQ( - n, - velocitys_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be " - "equal to Input(Param), but got the size of Output(VelocityOut) is " - "%d, the size of Input(Param) is %d.", - velocitys_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(velocitys[i], - velocitys_out[i], - platform::errors::InvalidArgument( - "Input(Velocity) and Output(VelocityOut) must be " - "the same Tensors.")); - } - - T mu = static_cast(ctx.Attr("mu")); - auto lrs = ctx.MultiInput("LearningRate"); - if (lrs.size() != 1) { - 
PADDLE_ENFORCE_EQ( - n, - lrs.size(), - platform::errors::InvalidArgument( - "If the size of Input(LearningRate) is not 1, the size of " - "Input(LearningRate) must be " - "equal to Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lrs.size(), - n)); - } - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_methods = - ctx.Attr>("regularization_method"); - auto regularization_coeffs = - ctx.Attr>("regularization_coeff"); - if (regularization_methods.size() != 0) { - PADDLE_ENFORCE_EQ( - n, - regularization_methods.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_method) must be equal " - "to Input(Param), but got the size of " - "Attr(regularization_method) is %d, the size of Input(Param) is " - "%d.", - regularization_methods.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - regularization_coeffs.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_coeff) must be equal " - "to Input(Param), but got the size of Attr(regularization_coeff) " - "is %d, the size of Input(Param) is %d.", - regularization_coeffs.size(), - n)); - } - - VLOG(5) << "use_nesterov: " << use_nesterov - << ", regularization_methods.size(): " - << regularization_methods.size() - << ", regularization_coeffs.size(): " - << regularization_coeffs.size(); - - auto& dev_ctx = ctx.template device_context(); - - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - for (size_t idx = 0; idx < n; ++idx) { - phi::RegularizationType regularization_flag = - regularization_methods.size() > 0 && - regularization_methods[idx] == "l2_decay" - ? phi::RegularizationType::kL2DECAY - : phi::RegularizationType::kNONE; - float regularization_coeff = 0.0; - if (regularization_coeffs.size() != 0) { - regularization_coeff = regularization_coeffs[idx]; - } - - auto learning_rate = lrs.size() > 1 ? 
lrs[idx] : lrs[0]; - auto param = params[idx]; - auto param_out = params_out[idx]; - auto velocity = velocitys[idx]; - auto velocity_out = velocitys_out[idx]; - - auto grad = grads[idx]; - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(merged_momentum, - ops::NPUMergedMomentumOpKernel, - ops::NPUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc deleted file mode 100644 index a5349e05b9b02..0000000000000 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/momentum_op.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - std::string regularization_method = - ctx.Attr("regularization_method"); - auto regularization_coeff = ctx.Attr("regularization_coeff"); - phi::RegularizationType regularization_flag{ - phi::RegularizationType::kNONE}; // disable regularization - if (regularization_method == "l2_decay") { - regularization_flag = phi::RegularizationType::kL2DECAY; - } - - T mu = static_cast(ctx.Attr("mu")); - bool use_nesterov = ctx.Attr("use_nesterov"); - - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); - - auto* grad_var = ctx.InputVar("Grad"); - if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ( - false, - true, - platform::errors::PermissionDenied("Unsupport SparseMomentum")); - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in MomentumOp. Excepted LodTensor " - "or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(momentum, - ops::NPUMomentumOpKernel, - ops::NPUMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc deleted file mode 100644 index 6ee01272f47e8..0000000000000 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class RMSPROPNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *grad_var = ctx.InputVar("Grad"); - auto *param_out = ctx.Output("ParamOut"); - auto *moment_out = ctx.Output("MomentOut"); - auto *mean_square_out = ctx.Output("MeanSquareOut"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - auto rho = static_cast(ctx.Attr("decay")); - auto momentum = static_cast(ctx.Attr("momentum")); - auto *p_tensor = ctx.Input("Param"); - auto *ms_tensor = ctx.Input("MeanSquare"); - auto *lr_tensor = ctx.Input("LearningRate"); - auto *mom_tensor = ctx.Input("Moment"); - bool centered = ctx.Attr("centered"); - - auto stream = - ctx.template device_context() - .stream(); - if (grad_var->IsType()) { - auto *grad_tensor = ctx.Input("Grad"); - if (centered) { - framework::NPUAttributeMap attr_input = {{"use_locking", false}}; - const phi::DenseTensor *rho_tensor = nullptr; - const phi::DenseTensor *momentum_tensor = nullptr; - const phi::DenseTensor *epsilon_tensor = nullptr; - phi::DenseTensor rho_tmp(phi::DataType::FLOAT32); - rho_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&rho_tmp, rho); - rho_tensor = &rho_tmp; - phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32); - momentum_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&momentum_tmp, momentum); - momentum_tensor = &momentum_tmp; - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - auto *mg_tensor = ctx.Input("MeanGrad"); - auto *mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - const auto &runner_applycenterrmsprop = NpuOpRunner( - std::string("ApplyCenteredRMSPropD"), - {*p_tensor, - *mg_tensor, - *ms_tensor, - *mom_tensor, - *lr_tensor, - *rho_tensor, - *momentum_tensor, - *epsilon_tensor, - *grad_tensor}, - {*param_out, *mean_grad_out, *mean_square_out, *moment_out}, - {attr_input}); - runner_applycenterrmsprop.Run(stream); - } else { - framework::NPUAttributeMap attr_input = { - {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}}; - const auto &runner_applyrmsprop = NpuOpRunner( - std::string("ApplyRMSPropD"), - {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor}, - {*param_out, *mean_square_out, *moment_out}, - {attr_input}); - runner_applyrmsprop.Run(stream); - } - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in RmspropOp. 
Excepted LodTensor, " - "But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - rmsprop, ops::RMSPROPNPUKernel) diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc deleted file mode 100644 index 7bd5cf8793cd0..0000000000000 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/optimizers/sgd_op.h" - -namespace paddle { -namespace operators { - -template -class SGDNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* learning_rate = ctx.Input("LearningRate"); - auto* param_var = ctx.Input("Param"); - auto* grad_var = ctx.Input("Grad"); - auto* param_out = ctx.Output("ParamOut"); - - param_out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ApplyGradientDescent", - {*param_var, *learning_rate, *grad_var}, - {*param_out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so - // if param and param_out is not same, we need to do copy. - if (param_out->data() != param_var->data()) { - framework::TensorCopy( - *param_var, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - sgd, - ops::SGDNPUKernel, - ops::SGDNPUKernel, - ops::SGDNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc deleted file mode 100644 index 7ec3183d412d4..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class ReduceAnyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - bool keep_dim = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - out->mutable_data(ctx.GetPlace()); - - // set attr - NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; - - const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc deleted file mode 100644 index aec1640181bcc..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(reduce_any); -USE_OP_DEVICE_KERNEL(reduce_any, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - std::vector init_x = {true, false, false, false}; - f::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - std::vector axes; - f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}}; - auto op = f::OpRegistry::CreateOp( - "reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - - ctx.Wait(); - - std::vector out_vec; - f::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - std::vector expected_vec = {true}; - EXPECT_EQ(out_vec.size(), expected_vec.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_vec[i]); - } -} - -TEST(reduce_any, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc deleted file mode 100644 index de4049c7e7f97..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
deleted file mode 100644
index de4049c7e7f97..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc
+++ /dev/null
@@ -1,216 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMaxNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<platform::NPUDeviceContext>();
    if (framework::TransToProtoVarType(x->dtype()) ==
        framework::proto::VarType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      // ReduceMaxD has no int64 kernel; TypeAdapter runs it in int32.
      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};
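Before the gradient kernel, one note on the forward path's out_dtype handling: the reduction always runs in the input type into the cast_out temporary, and a single trailing Cast produces the requested dtype. A minimal host-side sketch of that shape of computation, simplified to a full reduction over one non-empty float vector with int64 standing in for cast_out_dtype (plain C++, not Paddle API):

#include <algorithm>
#include <cstdint>
#include <vector>

// Reduce in the input dtype, cast once at the end (assumes x is non-empty).
std::int64_t ReduceMaxThenCast(const std::vector<float>& x) {
  float m = x.front();                    // temporary in the input dtype
  for (float v : x) m = std::max(m, v);   // the ReduceMaxD step
  return static_cast<std::int64_t>(m);    // the trailing Cast step
}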
template <typename DeviceContext, typename T>
class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<phi::DenseTensor>("X");
    auto* out = context.Input<phi::DenseTensor>("Out");
    auto* out_grad =
        context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto reduce_dims = context.Attr<std::vector<int>>("dim");
    bool reduce_all = context.Attr<bool>("reduce_all");
    int in_dtype = context.Attr<int>("in_dtype");

    PADDLE_ENFORCE_EQ(
        in_dtype == -1,
        true,
        platform::errors::InvalidArgument(
            "NPU only supports in_dtype == -1 in reduce_max_grad op."));

    auto* x_grad =
        context.Output<phi::DenseTensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>(context.GetPlace());

    auto& dev_ctx =
        context.template device_context<platform::NPUDeviceContext>();
    auto place = context.GetPlace();
    auto stream = dev_ctx.stream();

    // broadcast
    auto x_dims_vec = phi::vectorize(x->dims());
    if (reduce_all) {
      reduce_dims.clear();
      for (size_t d = 0; d < x_dims_vec.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }

    phi::DenseTensor tmp_out, tmp_out_grad;
    auto tmp_out_dims_vec = x_dims_vec;
    for (auto d : reduce_dims) {
      if (d < 0) {
        d += x_dims_vec.size();
      }
      tmp_out_dims_vec[d] = 1;
    }

    tmp_out.ShareDataWith(*out);
    tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
    tmp_out_grad.ShareDataWith(*out_grad);
    tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));

    phi::DenseTensor transformed_out(x->type());
    transformed_out.Resize(phi::make_ddim(x_dims_vec));
    transformed_out.mutable_data<T>(place);
    // BroadcastTo consumes its shape input, so hand each runner its own
    // copy and keep x_dims_vec intact for the second Resize below.
    auto brd_shape = x_dims_vec;
    NpuOpRunner r_brd_out;
    r_brd_out.SetType("BroadcastTo")
        .AddInput(tmp_out)
        .AddInput(std::move(brd_shape))
        .AddOutput(transformed_out)
        .Run(stream);
    phi::DenseTensor transformed_out_grad(x->type());
    transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
    transformed_out_grad.mutable_data<T>(place);
    auto brd_grad_shape = x_dims_vec;
    NpuOpRunner r_brd_out_grad;
    r_brd_out_grad.SetType("BroadcastTo")
        .AddInput(tmp_out_grad)
        .AddInput(std::move(brd_grad_shape))
        .AddOutput(transformed_out_grad)
        .Run(stream);

    // compare
    phi::DenseTensor equal_cond;
    equal_cond.mutable_data<bool>(x_grad->dims(), place);
    const auto& r_equal =
        NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {});
    r_equal.Run(stream);

    // select
    phi::DenseTensor t_zero;
    t_zero.mutable_data<T>(x_grad->dims(), place);
    FillNpuTensorWithConstant<T>(&t_zero, static_cast<T>(0));
    t_zero.Resize(x_grad->dims());

    const auto& r_sel = NpuOpRunner(
        "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {});
    r_sel.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_max,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
    reduce_max_grad,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int64_t>,
    ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int>);
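The gradient kernel above composes BroadcastTo, Equal and SelectV2 into the usual max-grad rule: upstream gradient flows to every position that ties the maximum, and everywhere else gets zero. A minimal host-side sketch of that rule over a flat vector (plain C++, not Paddle API):

#include <cstddef>
#include <vector>

// Gradient of max: dy goes to each position equal to the (broadcast) max.
std::vector<float> MaxGrad(const std::vector<float>& x, float out,
                           float out_grad) {
  std::vector<float> x_grad(x.size(), 0.0f);    // the t_zero tensor
  for (std::size_t i = 0; i < x.size(); ++i)
    if (x[i] == out) x_grad[i] = out_grad;      // Equal + SelectV2
  return x_grad;
}
// e.g. x = {1, 3, 3}, out = 3, out_grad = 1  ->  x_grad = {0, 1, 1}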
diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
deleted file mode 100644
index 65fabbd21cb7e..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc
+++ /dev/null
@@ -1,129 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class NPUReduceMeanOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output = ctx.Output<phi::DenseTensor>("Out");
    output->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");

    auto input_dims = input->dims();
    if (reduce_all) {
      dims.clear();
      for (int i = 0; i < input_dims.size(); i++) {
        dims.push_back(static_cast<int>(i));
      }
    }

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();

    NpuOpRunner runner;
    runner.SetType("ReduceMean")
        .AddInput(*input)
        .AddInput(std::move(dims))
        .AddOutput(*output)
        .AddAttrs({{"keep_dims", keep_dim}})
        .Run(stream);
  }
};

template <typename DeviceContext, typename T>
class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<phi::DenseTensor>("X");
    auto* output_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(ctx.GetPlace());

    bool reduce_all = ctx.Attr<bool>("reduce_all");
    auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
    auto input_dims = input->dims();

    int reduce_numel = 1;
    if (reduce_all) {
      reduce_dims.clear();
      for (int d = 0; d < input_dims.size(); ++d) {
        reduce_dims.push_back(static_cast<int>(d));
      }
    }
    for (auto& d : reduce_dims) {
      if (d < 0) {
        d = d + input_dims.size();
      }
      reduce_numel *= input_dims[d];
    }

    phi::DenseTensor tensor_value(input_grad->dtype());
    tensor_value.mutable_data<T>({1}, ctx.GetPlace());
    FillNpuTensorWithConstant<T>(
        &tensor_value,
        static_cast<T>(1.0f / static_cast<float>(reduce_numel)));

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
    NpuOpRunner runner;
    runner.SetType("Fill")
        .AddInput(phi::vectorize(input_dims))
        .AddInput(tensor_value)
        .AddOutput(*input_grad)
        .Run(stream);

    phi::DenseTensor transformed_input_grad, transformed_out_grad;
    phi::DenseTensor tmp_output_grad;
    auto tmp_output_dims = input_dims;
    for (auto d : reduce_dims) {
      tmp_output_dims[d] = 1;
    }
    tmp_output_grad.ShareDataWith(*output_grad);
    tmp_output_grad.Resize(tmp_output_dims);
    auto& dev_ctx =
        ctx.template device_context<platform::NPUDeviceContext>();
    NpuElementWiseOpBroadcast<T>(dev_ctx,
                                 input_grad,
                                 &tmp_output_grad,
                                 0,
                                 &transformed_input_grad,
                                 &transformed_out_grad);
    const auto& runner2 =
        NpuOpRunner("Mul",
                    {transformed_input_grad, transformed_out_grad},
                    {*input_grad},
                    {});
    runner2.Run(stream);
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    reduce_mean, ops::NPUReduceMeanOpKernel<plat::NPUDeviceContext, float>);
REGISTER_OP_NPU_KERNEL(
    reduce_mean_grad,
    ops::NPUReduceMeanGradOpKernel<plat::NPUDeviceContext, float>);
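The mean gradient above is built from just two device ops: Fill writes the constant 1/N at the input's full shape, and Mul scales it by the broadcast upstream gradient, where N is the number of elements folded into each output. A minimal host-side sketch of the rule, reduced to the all-axes case (plain C++, not Paddle API):

#include <cstddef>
#include <vector>

// Gradient of mean over all n elements: each input receives dy / n.
std::vector<float> MeanGradAll(std::size_t n, float out_grad) {
  float inv_n = 1.0f / static_cast<float>(n);      // the Fill value
  return std::vector<float>(n, out_grad * inv_n);  // the Mul step
}
// e.g. n = 4, out_grad = 1  ->  {0.25, 0.25, 0.25, 0.25}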
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
deleted file mode 100644
index e4adc42283120..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc
+++ /dev/null
@@ -1,123 +0,0 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceMinNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    const auto& dev_ctx =
        ctx.template device_context<platform::NPUDeviceContext>();
    if (x->dtype() == phi::DataType::INT64) {
      auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
                        const std::vector<phi::DenseTensor>& outputs,
                        const NPUAttributeMap& attrs,
                        const platform::NPUDeviceContext& dev_ctx) {
        const auto& runner =
            NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs);
        runner.Run(dev_ctx.stream());
      };

      // ReduceMinD has no int64 kernel; TypeAdapter runs it in int32.
      NpuOpRunner::TypeAdapter({*x},
                               {cast_out},
                               attr_input,
                               dev_ctx,
                               op_func,
                               {framework::proto::VarType::INT32},
                               {framework::proto::VarType::INT32});
    } else {
      const auto& runner =
          NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input);
      runner.Run(dev_ctx.stream());
    }

    if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(dev_ctx.stream());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_min,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, plat::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int>);
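Both min and max take the same TypeAdapter detour for int64 inputs: the tensor is cast down to int32, reduced there, and the result widened back. A host-side sketch of that round trip (plain C++, not Paddle API); like the real adapter, it is only lossless under the assumption that the values fit in 32 bits:

#include <algorithm>
#include <cstdint>
#include <vector>

// int64 reduce-min emulated through int32 (assumes non-empty input and
// values representable in int32).
std::int64_t ReduceMinViaInt32(const std::vector<std::int64_t>& x) {
  std::int32_t m = static_cast<std::int32_t>(x.front());  // cast down
  for (auto v : x) m = std::min(m, static_cast<std::int32_t>(v));
  return static_cast<std::int64_t>(m);                    // widen back
}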
a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
deleted file mode 100644
index fd9bf28b60793..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc
+++ /dev/null
@@ -1,102 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceProdNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto dims = ctx.Attr<std::vector<int>>("dim");
    bool keep_dim = ctx.Attr<bool>("keep_dim");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    int out_dtype = ctx.Attr<int>("out_dtype");

    auto place = ctx.GetPlace();

    phi::DenseTensor cast_out(x->type());
    cast_out.Resize(out->dims());
    cast_out.mutable_data<T>(place);

    auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
    if (out_dtype != -1) {
      cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
    }

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      if (cast_out_dtype == framework::proto::VarType::FP32) {
        out->mutable_data<float>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP16) {
        out->mutable_data<platform::float16>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT16) {
        out->mutable_data<int16_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT32) {
        out->mutable_data<int32_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::INT64) {
        out->mutable_data<int64_t>(place);
      } else if (cast_out_dtype == framework::proto::VarType::FP64) {
        out->mutable_data<double>(place);
      } else if (cast_out_dtype == framework::proto::VarType::BOOL) {
        out->mutable_data<bool>(place);
      }
    } else {
      out->ShareDataWith(cast_out);
    }

    framework::NPUAttributeMap attr_input = {{"axes", dims},
                                             {"keep_dims", keep_dim}};

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
    }

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();

    const auto& runner =
        NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input);
    runner.Run(stream);

    if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
      auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
    reduce_prod,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, float>,
    ops::ReduceProdNPUKernel<plat::NPUDeviceContext, plat::float16>);
diff --git
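Every reduce kernel in this diff repeats the same reduce_all expansion: when the flag is set, the "axes" attribute is rebuilt as [0, 1, ..., rank-1] so every dimension is folded into the reduction. A one-function sketch of that helper (plain C++, not Paddle API):

#include <numeric>
#include <vector>

// Expand reduce_all into an explicit axes list covering every dimension.
std::vector<int> AllAxes(int rank) {
  std::vector<int> axes(rank);
  std::iota(axes.begin(), axes.end(), 0);  // fills 0, 1, ..., rank-1
  return axes;
}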
a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
deleted file mode 100644
index 0c6665494ece7..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
+++ /dev/null
@@ -1,171 +0,0 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <memory>
#include <string>

#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/unsqueeze_op.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ReduceSumNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    out->mutable_data<T>(ctx.GetPlace());

    // special case: reducing a 1-D tensor without keep_dim would drop to
    // a 0-D result, so force keep_dim instead
    if (x->dims().size() == 1 && keep_dims == false) {
      keep_dims = true;
    }

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();

    phi::DenseTensor cast_x;
    phi::DenseTensor cast_out;
    // NOTE: ReduceSumD only supports fp32 and fp16
    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      cast_x.Resize(x->dims());
      cast_x.mutable_data<float>(ctx.GetPlace());
      auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
      const auto& runner_cast = NpuOpRunner(
          "Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);

      cast_out.Resize(out->dims());
      cast_out.mutable_data<float>(ctx.GetPlace());
    } else {
      cast_x.ShareDataWith(*x);
      cast_out.ShareDataWith(*out);
    }

    if (reduce_all) {
      std::vector<int> dim_vec;
      for (int i = 0; i < x->dims().size(); i++) {
        dim_vec.push_back(i);
      }

      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dim_vec}, {"keep_dims", keep_dims}});
      runner.Run(stream);

    } else {
      const auto& runner =
          NpuOpRunner("ReduceSumD",
                      {cast_x},
                      {cast_out},
                      {{"axes", dims}, {"keep_dims", keep_dims}});
      runner.Run(stream);
    }

    if (framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP32 &&
        framework::TransToProtoVarType(x->dtype()) !=
            framework::proto::VarType::FP16) {
      auto dst_dtype =
          ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype()));
      const auto& runner_cast =
          NpuOpRunner("Cast",
                      {cast_out},
                      {*out},
                      {{"dst_type", static_cast<int>(dst_dtype)}});
      runner_cast.Run(stream);
    }
  }
};

template <typename DeviceContext, typename T>
class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* out_grad =
        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
    auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
    bool reduce_all = ctx.Attr<bool>("reduce_all");
    bool keep_dims = ctx.Attr<bool>("keep_dim");
    auto dims = ctx.Attr<std::vector<int>>("dim");

    x_grad->mutable_data<T>(ctx.GetPlace());

    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
    if (keep_dims || reduce_all) {
      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {*out_grad},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    } else {
      auto out_dims = UnsqueezeKernel<DeviceContext, T>::GetOutputShape(
          dims, out_grad->dims());

      phi::DenseTensor out_grad_tmp(out_grad->type());
      out_grad_tmp.Resize(out_dims);
      out_grad_tmp.mutable_data<T>(ctx.GetPlace());
      framework::TensorCopy(
          *out_grad,
          ctx.GetPlace(),
          ctx.template device_context<platform::NPUDeviceContext>(),
          &out_grad_tmp);
      out_grad_tmp.Resize(out_dims);

      const auto& runner = NpuOpRunner("BroadcastToD",
                                       {out_grad_tmp},
                                       {*x_grad},
                                       {{"shape", phi::vectorize(x->dims())}});
      runner.Run(stream);
    }
  }
};
}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(
    reduce_sum,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>,
    ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
    reduce_sum_grad,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>,
    ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int>);
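The sum gradient above needs no arithmetic at all: since d(sum)/dx_i = 1 for every i, BroadcastToD simply copies the upstream gradient to every input position, with an unsqueeze first when keep_dim is false so the shapes line up (that is what GetOutputShape computes). A minimal host-side sketch of the all-axes case (plain C++, not Paddle API):

#include <cstddef>
#include <vector>

// Gradient of sum over all n elements: each input receives dy unchanged.
std::vector<float> SumGradAll(std::size_t n, float out_grad) {
  return std::vector<float>(n, out_grad);  // the BroadcastToD step
}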
NpuOpRunner("BroadcastToD", - {*out_grad}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } else { - framework::DDim out_dims; - out_dims = UnsqueezeKernel::GetOutputShape( - dims, out_grad->dims()); - - phi::DenseTensor out_grad_tmp(out_grad->type()); - out_grad_tmp.Resize(out_dims); - out_grad_tmp.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *out_grad, - ctx.GetPlace(), - ctx.template device_context(), - &out_grad_tmp); - out_grad_tmp.Resize(out_dims); - - const auto& runner = NpuOpRunner("BroadcastToD", - {out_grad_tmp}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reduce_sum, - ops::ReduceSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumNPUKernel, -#endif - ops::ReduceSumNPUKernel, - ops::ReduceSumNPUKernel); -REGISTER_OP_NPU_KERNEL( - reduce_sum_grad, - ops::ReduceSumGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumGradNPUKernel, -#endif - ops::ReduceSumGradNPUKernel, - ops::ReduceSumGradNPUKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc deleted file mode 100644 index 3978923d46af7..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" - -namespace paddle { -namespace operators { - -template -class SequenceMaskNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - int maxlen = ctx.Attr("maxlen"); - - if (ctx.HasInput("MaxLenTensor")) { - auto max_len_tensor = ctx.Input("MaxLenTensor"); - PADDLE_ENFORCE_NOT_NULL(max_len_tensor, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) should not be NULL." - "But received Input(MaxLenTensor) is NULL")); - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *max_len_tensor, platform::CPUPlace(), &temp); - maxlen = *temp.data(); - PADDLE_ENFORCE_GT( - maxlen, - 0, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) value should be greater than 0. 
But " - "received Input(MaxLenTensor) value = %d.", - maxlen)); - } - - if (maxlen < 0) { - auto x_numel = x->numel(); - if (x_numel == 0) { - maxlen = 0; - } else { - std::vector x_vec; - framework::TensorToVector(*x, dev_ctx, &x_vec); - auto x_data = x_vec.data(); - maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); - } - } - auto y_dim = phi::vectorize(x->dims()); - y_dim.push_back(maxlen); - - phi::DenseTensor cast_x; - cast_x.mutable_data(x->dims(), ctx.GetPlace()); - const auto& cast1_runner = NpuOpRunner( - "Cast", - {*x}, - {cast_x}, - {{"dst_type", - ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}}); - cast1_runner.Run(dev_ctx.stream()); - - phi::DenseTensor tmp; - tmp.mutable_data(phi::make_ddim({maxlen}), ctx.GetPlace()); - NpuOpRunner range_runner; - range_runner.SetType("Range"); - range_runner.AddInput(std::vector({0})); - range_runner.AddInput(std::vector({maxlen})); - range_runner.AddInput(std::vector({1})); - range_runner.AddOutput(tmp); - range_runner.Run(dev_ctx.stream()); - - phi::DenseTensor expand_tmp; - expand_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& expand_runner = - NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}}); - expand_runner.Run(dev_ctx.stream()); - - auto x_dims = phi::vectorize(x->dims()); - x_dims.push_back(1); - cast_x.Resize(phi::make_ddim({x_dims})); - phi::DenseTensor x_tmp; - x_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& tile_runner = - NpuOpRunner("TileWithAxis", - {cast_x}, - {x_tmp}, - {{"axis", x->dims().size()}, {"tiles", maxlen}}); - tile_runner.Run(dev_ctx.stream()); - - phi::DenseTensor y_tmp; - y_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& less_runner = - NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {}); - less_runner.Run(dev_ctx.stream()); - - y->Resize(phi::make_ddim(y_dim)); - auto out_dtype = static_cast( - ctx.Attr("out_dtype")); - if (out_dtype == framework::proto::VarType::INT32) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::INT64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP32) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::BOOL) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::UINT8) { - y->mutable_data(ctx.GetPlace()); - } else { - PADDLE_ENFORCE(false, - platform::errors::InvalidArgument( - "out_dtype only supporing int32, int64, fp32, fp64, " - "bool, uint8, but receive out_dtype is %d", - out_dtype)); - } - - const auto& cast2_runner = NpuOpRunner( - "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}}); - cast2_runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - sequence_mask, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel);