diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index d3d35cb3ae7e5..e068d2a9a5181 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -62,7 +62,6 @@ function(op_library TARGET)
   set(hip_cc_srcs)
   set(xpu_cc_srcs)
   set(xpu_kp_cc_srcs)
-  set(npu_cc_srcs)
   set(mlu_cc_srcs)
   set(cudnn_cu_cc_srcs)
   set(miopen_cu_cc_srcs)
@@ -320,12 +319,7 @@ function(op_library TARGET)
     if(WITH_UNITY_BUILD AND op_library_UNITY)
       # Combine the cc source files.
       compose_unity_target_sources(
-        ${UNITY_TARGET}
-        cc
-        ${cc_srcs}
-        ${mkldnn_cc_srcs}
-        ${xpu_cc_srcs}
-        ${npu_cc_srcs}
+        ${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}
         ${mlu_cc_srcs})
       if(TARGET ${UNITY_TARGET})
         # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`.
@@ -339,12 +333,6 @@ function(op_library TARGET)
       endif()
       # Add alias library to handle dependencies.
       add_library(${TARGET} ALIAS ${UNITY_TARGET})
-    else()
-      cc_library(
-        ${TARGET}
-        SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}
-          ${mlu_cc_srcs}
-        DEPS ${op_library_DEPS} ${op_common_deps})
     endif()
   endif()
@@ -355,7 +343,6 @@ function(op_library TARGET)
   list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
   list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
   list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
-  list(LENGTH npu_cc_srcs npu_cc_srcs_len)
   list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
 
   # Define operators that don't need pybind here.
@@ -590,7 +577,6 @@ function(register_operators)
     "*_op.cc")
   string(REPLACE "_mkldnn" "" OPS "${OPS}")
   string(REPLACE "_xpu" "" OPS "${OPS}")
-  string(REPLACE "_npu" "" OPS "${OPS}")
   string(REPLACE "_mlu" "" OPS "${OPS}")
   string(REPLACE ".cc" "" OPS "${OPS}")
   list(REMOVE_DUPLICATES OPS)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4955d47ca77e5..27842543c5902 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -100,12 +100,12 @@ register_operators(EXCLUDES py_func_op warpctc_op dgc_op generated_op1 generated
     recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
-op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc run_program_op_npu.cc DEPS executor_cache ${OP_HEADER_DEPS})
-target_link_libraries(run_program_op cuda_graph_with_memory_pool)
+
 op_library(quantize_linear_op DEPS phi)
 op_library(save_combine_op DEPS string_array phi)
 op_library(load_combine_op DEPS string_array)
+
 if (WITH_GPU OR WITH_ROCM)
   op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
 elseif (WITH_XPU_KP)
@@ -179,10 +179,7 @@ if (WITH_ASCEND)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper)
 endif()
 
-if (WITH_ASCEND_CL)
-  cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS generated_static_op)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner)
-endif()
+
 # FIXME(typhoonzero): operator deps may not needed.
 # op_library(unsqueeze_op DEPS reshape_op)
@@ -218,18 +215,13 @@ if (WITH_PYTHON)
   cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind)
 endif()
 
-if (WITH_ASCEND_CL)
-  cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
-  cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_function scope device_context enforce executor compare_op)
-endif()
+
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 add_subdirectory(benchmark)
 
 cc_test_old(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op ${COMMON_OP_DEPS})
 
-if (WITH_ASCEND_CL)
-  cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor)
-endif()
+
 if(WITH_MKLDNN)
diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc
deleted file mode 100644
index 0a859d1f564a9..0000000000000
--- a/paddle/fluid/operators/abs_op_npu.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class AbsNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("Abs",
-                                     {
-                                         *x,
-                                     },
-                                     {*out},
-                                     {});
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class AbsGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-
-    dx->mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {});
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    abs,
-    ops::AbsNPUKernel<plat::NPUDeviceContext, float>,
-    ops::AbsNPUKernel<plat::NPUDeviceContext, plat::float16>);
-
-REGISTER_OP_NPU_KERNEL(
-    abs_grad,
-    ops::AbsGradNPUKernel<plat::NPUDeviceContext, float>,
-    ops::AbsGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
deleted file mode 100644
index 9f3392f2eabc5..0000000000000
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ /dev/null
@@ -1,1116 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include -#include - -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class PowNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto factor = ctx.Attr("factor"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Power", - {*x}, - {*out}, - {{"power", factor}, - {"scale", static_cast(1.0)}, - {"shift", static_cast(0.0)}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PowGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto factor = ctx.Attr("factor"); - - auto x_dims = x->dims(); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(liym27): dx = dout * factor * x.pow(factor-1) - - // Step1: Compute x_pow = x.pow(factor-1) - phi::DenseTensor x_pow(x->type()); - x_pow.mutable_data(x->dims(), place); - const auto& runner_pow = NpuOpRunner( - "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); - runner_pow.Run(stream); - - // Step 2: Construct a broadcast factor, which has the same shape with x. - - // 2.1 Get a factor tensor with shape [1]. - phi::DenseTensor factor_tensor(phi::DataType::FLOAT32); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, factor); - - // 2.2 Get the factor which has the shape with x and the same value with - // factor. 
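// ---- [Editorial aside, not part of the deleted file] ----------------------
// Steps 1-4 of this PowGrad kernel assemble the ordinary power rule,
//   d/dx x^f = f * x^(f-1)   =>   dx = dout * factor * pow(x, factor - 1),
// from NPU primitives: Power computes x^(factor-1), FillD broadcasts the
// scalar factor to x's shape, and the two Muls form the final product.
// ----------------------------------------------------------------------------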
- phi::DenseTensor factor_bc_tensor(phi::DataType::FLOAT32); - factor_bc_tensor.mutable_data(x_dims, place); - const auto& runner_bc = NpuOpRunner("FillD", - {factor_tensor}, - {factor_bc_tensor}, - {{"dims", phi::vectorize(x_dims)}}); - runner_bc.Run(stream); - - // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) - phi::DenseTensor x_power_mul_factor(x->type()); - x_power_mul_factor.mutable_data(x->dims(), place); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); - runner_mul_1.Run(stream); - - // Step 4: Compute dx = dout * factor * x.pow(factor-1) - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class ReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class Relu6NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu6", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Relu6GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Relu6Grad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class SqrtNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LeakyReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto alpha = ctx.Attr("alpha"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("LeakyRelu", {*x}, {*out}, {{"negative_slope", alpha}}); - runner.Run(stream); - } -}; - -template -class 
LeakyReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto alpha = ctx.Attr("alpha"); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "LeakyReluGrad", {*dout, *x}, {*dx}, {{"negative_slope", alpha}}); - - runner.Run(stream); - } -}; - -template -class SqrtGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class LogNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor one(x->type()); - one.mutable_data(x->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); - runner_one.Run(stream); - - phi::DenseTensor sub(x->type()); - sub.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {}); - runner_out.Run(stream); - } -}; - -template -class LogGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class TanhNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class TanhGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class SquareNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto 
place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SquareGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto factor = static_cast(2.0); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - // Step 1: Compute x_muls_factor = factor * x - phi::DenseTensor x_muls_factor(x->type()); - x_muls_factor.mutable_data(x->dims(), place); - const auto& runner_muls_1 = - NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); - runner_muls_1.Run(stream); - - // Step 2: Compute dx = dout * factor * x - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_muls_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class SigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sigmoid", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("SigmoidGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -// Swish = x * sigmoid(beta * x) -template -class SwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float beta = ctx.Attr("beta"); - - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {*out}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = NpuOpRunner("Sigmoid", {*out}, {*out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner("Mul", {*x, *out}, {*out}); - mul_runner.Run(stream); - } -}; - -template -class SwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - float beta = ctx.Attr("beta"); - - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor beta_x, sigmoid_out, swish_out; - beta_x.mutable_data(x->dims(), ctx.GetPlace()); - sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); - swish_out.mutable_data(x->dims(), ctx.GetPlace()); - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = - 
NpuOpRunner("Sigmoid", {beta_x}, {sigmoid_out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = - NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {}); - mul_runner.Run(stream); - const auto& muls_runner2 = - NpuOpRunner("Muls", {swish_out}, {swish_out}, {{"value", beta}}); - muls_runner2.Run(stream); - - const auto& mul_runner1 = - NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {}); - mul_runner1.Run(stream); - - const auto& sub_runner = NpuOpRunner("Sub", {swish_out, *dx}, {*dx}, {}); - sub_runner.Run(stream); - - const auto& add_runner = NpuOpRunner("Add", {sigmoid_out, *dx}, {*dx}, {}); - add_runner.Run(stream); - - const auto& mul_runner2 = NpuOpRunner("Mul", {*dout, *dx}, {*dx}, {}); - mul_runner2.Run(stream); - } -}; - -// HardSwish = min(max(0, x+offset), threshold) * x / scale -template -class HardSwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); - - phi::DenseTensor tensor_zero(x->type()); - tensor_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); - - phi::DenseTensor clip_val(x->type()); - clip_val.mutable_data(x->dims(), place); - const auto& runner_clip = - NpuOpRunner("ClipByValue", - {add_offset_val, tensor_zero, tensor_threshold}, - {clip_val}); - runner_clip.Run(stream); - - phi::DenseTensor tensor_scale_tmp(x->type()); - tensor_scale_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); - phi::DenseTensor tensor_scale(x->type()); - tensor_scale.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_scale_tmp}, - {tensor_scale}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor div_val(x->type()); - div_val.mutable_data(x->dims(), place); - const auto& runner_div = - NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); - runner_div.Run(stream); - - const auto& runner_mul = NpuOpRunner("Mul", {*x, div_val}, {*out}); - runner_mul.Run(stream); - } -}; - -template -class HardSwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - 
FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tmp1(x->type()); - tmp1.mutable_data(x->dims(), place); - const auto& runner_pow1 = NpuOpRunner( - "Power", {*x}, {tmp1}, {{"scale", 2.0f}, {"shift", offset}}); - runner_pow1.Run(stream); - - phi::DenseTensor tmp2(x->type()); - tmp2.mutable_data(x->dims(), place); - const auto& runner_ht_grad = - NpuOpRunner("HardtanhGrad", - {add_offset_val, tmp1}, - {tmp2}, - {{"min_val", 0.0f}, {"max_val", threshold}}); - runner_ht_grad.Run(stream); - - phi::DenseTensor tmp3(x->type()); - tmp3.mutable_data(x->dims(), place); - const auto& runner_pow2 = NpuOpRunner( - "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); - runner_pow2.Run(stream); - - phi::DenseTensor tensor_threshold_tmp(x->type()); - tensor_threshold_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold_tmp, - static_cast(threshold)); - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_threshold_tmp}, - {tensor_threshold}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor tmp_bool(phi::DataType::BOOL); - tmp_bool.mutable_data(x->dims(), place); - const auto& runner_less = - NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); - runner_less.Run(stream); - phi::DenseTensor tmp4(x->type()); - tmp4.mutable_data(x->dims(), place); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast = - NpuOpRunner("Cast", - {tmp_bool}, - {tmp4}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - - phi::DenseTensor tmp5(x->type()); - tmp5.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); - runner_sub.Run(stream); - - const auto& runner_final = NpuOpRunner("Mul", {tmp5, *dout}, {*dx}); - runner_final.Run(stream); - } -}; - -template -class HardSigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("HardSigmoid", {*x}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class HardSigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - dx->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("HardSigmoidGrad", {*dout, *out}, {*dx}, attr_input); - runner_dx.Run(stream); - } -}; - -template -class ReciprocalNPUKernel : public framework::OpKernel { - 
public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Reciprocal", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ReciprocalGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = - NpuOpRunner("ReciprocalGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class CosNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Cos", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class CosGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - phi::DenseTensor sin_out(x->type()); // Temporary phi::DenseTensor - sin_out.Resize(x->dims()); - sin_out.mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Sin", {*x}, {sin_out}, {}); - runner.Run(stream); - - const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); - runner_dx.Run(stream); - - phi::DenseTensor tmp(x->type()); // Temporary phi::DenseTensor - tmp.Resize(phi::make_ddim({1, 1})); - tmp.mutable_data(place); - float factor = -1.; - FillNpuTensorWithConstant(&tmp, static_cast(factor)); - - const auto& runner_dx_ = NpuOpRunner("Xdivy", {*dx, tmp}, {*dx}, {}); - runner_dx_.Run(stream); - // dx = -dout * Sine(x); - } -}; - -template -class AtanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class AtanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class ExpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - 
out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Exp", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ExpGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Mul", {*dout, *out}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class SinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sin", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - pow, - ops::PowNPUKernel, - ops::PowNPUKernel); - -REGISTER_OP_NPU_KERNEL( - pow_grad, - ops::PowGradNPUKernel, - ops::PowGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu, - ops::ReluNPUKernel, - ops::ReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu_grad, - ops::ReluGradNPUKernel, - ops::ReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6, - ops::Relu6NPUKernel, - ops::Relu6NPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6_grad, - ops::Relu6GradNPUKernel, - ops::Relu6GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu, - ops::LeakyReluNPUKernel, - ops::LeakyReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu_grad, - ops::LeakyReluGradNPUKernel, - ops::LeakyReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt, - ops::SqrtNPUKernel, - ops::SqrtNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt_grad, - ops::SqrtGradNPUKernel, - ops::SqrtGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log, - ops::LogNPUKernel, - ops::LogNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log_grad, - ops::LogGradNPUKernel, - ops::LogGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh, - ops::TanhNPUKernel, - ops::TanhNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh_grad, - ops::TanhGradNPUKernel, - ops::TanhGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square, - ops::SquareNPUKernel, - ops::SquareNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square_grad, - ops::SquareGradNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid, - ops::SigmoidNPUKernel, - ops::SigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid_grad, - ops::SigmoidGradNPUKernel, - ops::SigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish, - ops::SwishNPUKernel, - ops::SwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish_grad, - ops::SwishGradNPUKernel, - ops::SwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish, - ops::HardSwishNPUKernel, - ops::HardSwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish_grad, - ops::HardSwishGradNPUKernel, - ops::HardSwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid, - ops::HardSigmoidNPUKernel, - ops::HardSigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid_grad, - ops::HardSigmoidGradNPUKernel, - ops::HardSigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - reciprocal, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel); - 
-REGISTER_OP_NPU_KERNEL( - reciprocal_grad, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos, - ops::CosNPUKernel, - ops::CosNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos_grad, - ops::CosGradNPUKernel, - ops::CosGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan, - ops::AtanNPUKernel, - ops::AtanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan_grad, - ops::AtanGradNPUKernel, - ops::AtanGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp, - ops::ExpNPUKernel, - ops::ExpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp_grad, - ops::ExpGradNPUKernel, - ops::ExpGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sin, - ops::SinNPUKernel, - ops::SinNPUKernel, - ops::SinNPUKernel); diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index cbedb02f86836..cbd9c8b2768b4 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,11 +4,3 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() - -if(WITH_ASCEND_CL) - cc_test( - check_finite_and_unscale_op_npu_test - SRCS check_finite_and_unscale_op_npu_test.cc - DEPS op_registry check_finite_and_unscale_op scope device_context enforce - executor) -endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc deleted file mode 100644 index 424c2326ab201..0000000000000 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AllocFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* float_status = ctx.Output("FloatStatus"); - float_status->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - alloc_float_status, - ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc deleted file mode 100644 index 63e16fb357058..0000000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. -// On NPU, we do not really check the data of input tensors, -// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, -// and clear it after this op. -// Which may leads to wrong result if the input tensors is not calculated -// on NPU device, but got from other way, for example, feeding. -template -class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - const auto* float_status = ctx.Input("FloatStatus"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - found_inf->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // step1: inverse scale - phi::DenseTensor const_tensor; - const_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); - - // Inverse(1.0/scale) - phi::DenseTensor* tmp_inverse_out = const_cast(scale); - phi::DenseTensor inverse_out(scale->type()); - inverse_out.Resize(scale->dims()); - inverse_out.mutable_data(ctx.GetPlace()); - const auto& runner_inverse = - NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); - runner_inverse.Run(stream); - tmp_inverse_out = &inverse_out; - - // NOTE(zhiqiu): - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. - // tmp is only placeholder. - const auto& runner_float_status = - NpuOpRunner("NPUGetFloatStatus", - {*float_status}, - {tmp}, - {{"message", std::string("check_nan_and_inf")}}); - runner_float_status.Run(stream); - - phi::DenseTensor sum; - sum.mutable_data({1}, ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {*float_status}, - {sum}, - {{"axes", std::vector{0}}, {"keep_dims", true}}); - runner_reduce_sum.Run(stream); - - const auto& runner_greater = - NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {}); - runner_greater.Run(stream); - - // NOTE(zhiqiu): The normal logic is : - // out = in, if found_inf = true - // out = in/scale, if found_inf = false - // However, on NPU, in order to avoid stream sync, we do not copy the - // found_inf data to cpu to check whether to unscale or not. - // Instead, we do the Mul no matter found_inf or not. - // And, a fact is, only few steps contains nan/inf during training. 
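// ---- [Editorial aside, not part of the deleted file] ----------------------
// A minimal host-side sketch of the equivalence the NOTE above describes;
// the names (x, out, scale, found_inf, n) are illustrative stand-ins for the
// device-side tensors, not identifiers from the kernel:
//
//   float inverse_scale = 1.0f / scale;      // the Div runner above
//   for (size_t i = 0; i < n; ++i)           // the Mul loop below
//     out[i] = x[i] * inverse_scale;         // applied unconditionally
//
// When found_inf is true the outputs are discarded by the caller anyway, so
// multiplying unconditionally trades a little wasted work for skipping a
// device-to-host sync on found_inf.
// ----------------------------------------------------------------------------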
- for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(ctx.GetPlace()); - const auto& runner_mul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_mul.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleNPUKernel, - ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc deleted file mode 100644 index bf7272ba8b878..0000000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(check_finite_and_unscale); -USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); - -struct InputVars { - std::string name; - phi::DenseTensor *tensor; -}; - -template -void Compare(f::Scope *scope, const p::DeviceContext &ctx) { - const f::DDim dims = phi::make_ddim({2, 2}); - auto place = ctx.GetPlace(); - - // init input - std::vector input_names = { - {"x", scope->Var("x")->GetMutable()}, - {"x1", scope->Var("x1")->GetMutable()}}; - - auto *scale = scope->Var("scale")->GetMutable(); - - // init output - auto *out = scope->Var("out")->GetMutable(); - auto *out1 = scope->Var("out1")->GetMutable(); - auto *found_inf = scope->Var("found_inf")->GetMutable(); - - // Initialize input data - const int num_inputs = input_names.size(); - size_t numel = static_cast(phi::product(dims)); - - for (int i = 0; i < num_inputs; ++i) { - std::vector init_xs; - for (size_t j = 0; j < numel; ++j) { - if (j == 0) { - init_xs.push_back(static_cast(NAN)); - } else { - init_xs.push_back(static_cast(j + 1)); - } - } - f::TensorFromVector(init_xs, ctx, input_names[i].tensor); - input_names[i].tensor->Resize(dims); - } - - f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - "check_finite_and_unscale", - {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, - {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - // out0 - std::vector out_vec; - f::TensorToVector(*out, ctx, &out_vec); - EXPECT_EQ(out_vec.size(), static_cast(4)); - for (size_t j = 0; j < out_vec.size(); ++j) { - VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; - } - - 
ctx.Wait(); - - // out0 - std::vector out1_vec; - f::TensorToVector(*out1, ctx, &out1_vec); - EXPECT_EQ(out1_vec.size(), static_cast(4)); - for (size_t j = 0; j < out1_vec.size(); ++j) { - VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; - } - - ctx.Wait(); - - // out found_inf - phi::DenseTensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool *found_inf_data = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_TRUE(*found_inf_data); - - ctx.Wait(); -} - -TEST(check_finite_and_unscale, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(check_finite_and_unscale, NPU_fp16) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc deleted file mode 100644 index 1f3e54421f020..0000000000000 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClearFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); - // NOTE(zhiqiu): NPUClearFloatStatus modifies the input. - PADDLE_ENFORCE_EQ(float_status_out, - float_status, - platform::errors::PreconditionNotMet( - "The input(FloatStatus) and Output(FloatStatusOut) " - "should be the same.")); - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - const auto& runner = - NpuOpRunner("NPUClearFloatStatus", {tmp}, {*float_status_out}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - clear_float_status, - ops::ClearFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc deleted file mode 100644 index 5d8f88cc85f26..0000000000000 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GetFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); - // GetClearFloatStatus modifies the input. - PADDLE_ENFORCE_EQ(float_status_out, - float_status, - platform::errors::PreconditionNotMet( - "The input(FloatStatus) and Output(FloatStatusOut) " - "should be the same.")); - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - // NPUGetFloatStatus updates data on input in-place. - // tmp is only placeholder. - NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}).Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - get_float_status, - ops::GetFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc deleted file mode 100644 index d4565c1780928..0000000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ /dev/null @@ -1,293 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" - -DECLARE_int32(min_loss_scaling); - -namespace paddle { -namespace operators { - -template -void Update(const platform::NPUDeviceContext& ctx, - const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) { - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - if (found_inf_vec[0]) { - // good_out_data = 0 - auto g = good_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), - 0, - good_out_tensor->numel() * sizeof(int), - stream); - // bad_out_data = bad_in_data + 1 - phi::DenseTensor factor_tensor(bad_out_tensor->dtype()); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - const auto& runner_p2 = NpuOpRunner( - "Add", {*bad_in_tensor, factor_tensor}, {*bad_out_tensor}, {}); - runner_p2.Run(stream); - - std::vector bad_out_data; - paddle::framework::TensorToVector(*bad_out_tensor, ctx, &bad_out_data); - if (bad_out_data[0] >= decr_every_n_nan_or_inf) { - const auto& runner_p3 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", decr_ratio}, - {"shift", static_cast(0)}}); - - runner_p3.Run(stream); - - std::vector new_loss_scaling; - paddle::framework::TensorToVector( - *updated_loss_scaling_tensor, ctx, &new_loss_scaling); - float min_value = 1.0; - if (FLAGS_min_loss_scaling > 1) { - min_value = static_cast(FLAGS_min_loss_scaling); - } - - if (new_loss_scaling[0] < min_value) { - // updated_loss_scaling_data = 1 - const auto& runner_p4 = - NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(min_value)}}); - - runner_p4.Run(stream); - } - - // bad_out_data = 0 - auto b = bad_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(b), - 0, - bad_out_tensor->numel() * sizeof(int), - stream); - } - } else { - // bad_out_data = 0 - auto b = bad_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(b), - 0, - bad_out_tensor->numel() * sizeof(int), - stream); - - // good_out_data = good_in_data + 1 - phi::DenseTensor factor_tensor(good_out_tensor->dtype()); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - const auto& runner_p2 = NpuOpRunner( - "Add", {*good_in_tensor, factor_tensor}, {*good_out_tensor}, {}); - runner_p2.Run(stream); - - std::vector good_out_data; - paddle::framework::TensorToVector(*good_out_tensor, ctx, &good_out_data); - - if (good_out_data[0] >= incr_every_n_steps) { - const auto& runner_p3 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", incr_ratio}, - {"shift", static_cast(0)}}); - runner_p3.Run(stream); - - std::vector new_loss_scaling; - paddle::framework::TensorToVector( - *updated_loss_scaling_tensor, ctx, &new_loss_scaling); - if (!std::isfinite(new_loss_scaling[0])) { - // updated_loss_scaling_data = 
pre_loss_scaling_data - const auto& runner_p4 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(1)}, - {"shift", static_cast(0)}}); - - runner_p4.Run(stream); - } - // good_out_data = 0 - auto g = good_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), - 0, - good_out_tensor->numel() * sizeof(int), - stream); - } - } -} - -template -class UpdateLossScalingFunctor { - public: - void operator()(const platform::NPUDeviceContext& dev_ctx, - const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) const { - Update(dev_ctx, - found_inf_vec, - pre_loss_scaling_tensor, - good_in_tensor, - bad_in_tensor, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_tensor, - good_out_tensor, - bad_out_tensor); - } -}; - -template -class LazyZerosNPU { - public: - void operator()(const platform::NPUDeviceContext& dev_ctx, - const std::vector found_inf_vec, - const std::vector& xs, - const std::vector& outs) const { - if (!xs.size()) { - return; - } - auto place = dev_ctx.GetPlace(); - auto stream = dev_ctx.stream(); - phi::DenseTensor* zero_tensor = nullptr; - void* zero_ptr = nullptr; - if (found_inf_vec[0]) { - int max_num = -1; - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - int num = out->numel(); - if (max_num < num) { - max_num = num; - zero_tensor = out; - } - } - - zero_tensor->mutable_data(place); - const auto& runner_zeros = - NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); - runner_zeros.Run(stream); - zero_tensor->check_memory_size(); - zero_ptr = zero_tensor->data(); - } - - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - auto* x = xs[i]; - auto dst_ptr = out->mutable_data(place); - if (!found_inf_vec[0]) { - framework::TensorCopy(*x, place, dev_ctx, out); - } else if (zero_ptr != dst_ptr) { - auto size = out->numel() * phi::SizeOf(out->dtype()); - memory::Copy(place, dst_ptr, place, zero_ptr, size, stream); - } - } - } -}; - -template -class UpdateLossScalingNPUKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - const auto xs = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - const auto* found_inf = ctx.Input("FoundInfinite"); - PADDLE_ENFORCE_EQ(found_inf->numel(), - 1, - platform::errors::InvalidArgument( - "FoundInfinite must has only one element.")); - - std::vector found_inf_vec; - paddle::framework::TensorToVector( - *found_inf, ctx.device_context(), &found_inf_vec); - - LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); - const bool stop_update = ctx.Attr("stop_update"); - if (stop_update) { - return; - } - - const auto* pre_loss_scaling = - ctx.Input("PrevLossScaling"); - const auto* good_in = ctx.Input("InGoodSteps"); - const auto* bad_in = ctx.Input("InBadSteps"); - auto* updated_loss_scaling = ctx.Output("LossScaling"); - auto* good_out = ctx.Output("OutGoodSteps"); - auto* bad_out = ctx.Output("OutBadSteps"); - - 
updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); - good_out->mutable_data(dev_ctx.GetPlace()); - bad_out->mutable_data(dev_ctx.GetPlace()); - - const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); - const int decr_every_n_nan_or_inf = - ctx.Attr("decr_every_n_nan_or_inf"); - const float incr_ratio = ctx.Attr("incr_ratio"); - const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}(dev_ctx, - found_inf_vec, - pre_loss_scaling, - good_in, - bad_in, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling, - good_out, - bad_out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - update_loss_scaling, - ops::UpdateLossScalingNPUKernel, - ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc deleted file mode 100644 index 014fb09474936..0000000000000 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include "paddle/fluid/operators/arg_min_max_op_base.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -struct VisitDataArgNPUMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgNPUMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto dtype = ctx.Attr("dtype"); - const bool& flatten = ctx.Attr("flatten"); - - phi::DenseTensor transformed_x(x.type()); - transformed_x.ShareDataWith(x); - if (flatten) { - transformed_x.Resize(phi::make_ddim({x.numel()})); - } - - auto stream = ctx.template device_context().stream(); - NpuOpRunner runner; - runner.SetType("ArgMaxV2") - .AddInput(transformed_x) - .AddInput(std::vector{axis}) - .AddOutput(out) - .AddAttrDataType("dtype", dtype) - .Run(stream); - } -}; - -template -class ArgMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny(static_cast( - framework::proto::VarType::INT64), - VisitDataArgNPUMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgNPUMaxFunctor(ctx)); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(arg_max, - ops::ArgMaxNPUKernel, - ops::ArgMaxNPUKernel); diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc deleted file mode 100644 index e601efd2d37e1..0000000000000 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. 
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/arg_min_max_op_base.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ArgMinNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    int64_t axis = ctx.Attr<int64_t>("axis");
-    auto dtype = ctx.Attr<int>("dtype");
-
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<int32_t>(ctx.GetPlace());
-
-    NpuOpRunner runner;
-    runner.SetType("ArgMin")
-        .AddInput(*x)
-        .AddInput(std::vector<int64_t>{axis})
-        .AddOutput(*out)
-        .AddAttr("dtype", dtype);
-
-    auto stream =
-        ctx.template device_context<platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(
-    arg_min,
-    ops::ArgMinNPUKernel<float>,
-    ops::ArgMinNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc
deleted file mode 100644
index 18915ee4f3d79..0000000000000
--- a/paddle/fluid/operators/argsort_op_npu.cc
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void TranposeNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - std::vector* perm, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(in) - .AddInput(std::move(*perm)) - .AddOutput(*out) - .Run(stream); -} - -static void CastToInt64(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_INT64) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class ArgsortNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - auto stream = ctx.template device_context().stream(); - framework::NPUAttributeMap attr = {{"axis", -1}, - {"descending", descending}}; - - phi::DenseTensor indices_tmp(phi::DataType::INT32); - indices_tmp.Resize(indices->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor input_fp32(phi::DataType::FLOAT32); - input_fp32.Resize(input->dims()); - CastToFP32(ctx, stream, *input, &input_fp32); - - phi::DenseTensor output_fp32(phi::DataType::FLOAT32); - output_fp32.Resize(output->dims()); - - if (axis == -1 || axis + 1 == in_dims.size()) { - output_fp32.mutable_data(ctx.GetPlace()); - indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {input_fp32}, {output_fp32, indices_tmp}, attr); - runner.Run(stream); - - CastToInt64(ctx, stream, output_fp32, output); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input_fp32.type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, input_fp32, &trans_input); - - phi::DenseTensor trans_output(input_fp32.type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, &output_fp32); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - - CastToInt64(ctx, stream, output_fp32, output); - } - } else { - if (axis == -1 || axis + 1 == in_dims.size()) { - output->mutable_data(ctx.GetPlace()); - 
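Both the int64 and the generic branches of this kernel handle a non-trailing `axis` the same way: the NPU "Sort" op only sorts along the last dimension, so the data is transposed so that `axis` becomes last, sorted, and then transposed back with the same permutation. A self-contained sketch of that permutation trick (hypothetical helper name, not part of the original file):

#include <cstdint>
#include <utility>
#include <vector>

// Build the permutation that swaps `axis` with the last dimension of a
// rank-`rank` tensor. Applying the same permutation twice is the identity,
// which is why the kernel reuses `perm` for the inverse transpose.
std::vector<int64_t> MakeSwapToLastPerm(int64_t rank, int64_t axis) {
  std::vector<int64_t> perm(rank);
  for (int64_t i = 0; i < rank; ++i) perm[i] = i;  // identity
  std::swap(perm[axis], perm[rank - 1]);
  return perm;
}
// e.g. rank 4, axis 1 -> {0, 3, 2, 1}: transpose, sort the last axis,
// transpose again with the same perm to restore the original layout.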
indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); - runner.Run(stream); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input->type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, *input, &trans_input); - - phi::DenseTensor trans_output(input->type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, output); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - } - } - - CastToInt64(ctx, stream, indices_tmp, indices); - } -}; - -template -static void FullAssignNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const framework::DDim in_dims, - const phi::DenseTensor& input, - const phi::DenseTensor& indices, - phi::DenseTensor* t_out) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - phi::DenseTensor input_tmp; - input_tmp.ShareDataWith(input); - input_tmp.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - phi::DenseTensor indices_tmp; - indices_tmp.ShareDataWith(indices); - indices_tmp.Resize( - phi::make_ddim(std::vector{input_height, input_width})); - - std::vector indexs_value; - for (Type i = 0; i < input_height; i++) { - indexs_value.push_back(i * input_width); - } - phi::DenseTensor indexs_tmp(indices.type()); - framework::TensorFromVector( - indexs_value, ctx.device_context(), &indexs_tmp); - indexs_tmp.Resize(phi::make_ddim(std::vector{input_height, 1})); - - phi::DenseTensor indices_index(indices.type()); - indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); - const auto& runner_add = - NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); - runner_add.Run(stream); - - indices_index.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - t_out->mutable_data(ctx.GetPlace()); - phi::DenseTensor out_tmp(t_out->type()); - out_tmp.ShareDataWith(*t_out); - - const auto& runner = NpuOpRunner("TensorScatterUpdate", - {input_tmp, indices_index, input_tmp}, - {out_tmp}, - {}); - runner.Run(stream); -} - -template -class ArgsortGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - if (dO->numel() == 0) return; - - auto stream = ctx.template device_context().stream(); - - if (axis == -1 || axis + 1 == in_dims.size()) { - FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_dout(dO->type()); - phi::DenseTensor trans_ids(indices->type()); - trans_dout.Resize(trans_dims); - trans_ids.Resize(trans_dims); - - TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); - TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); - - phi::DenseTensor trans_dx(dO->type()); - trans_dx.Resize(trans_dims); - FullAssignNPU( - ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx); - - TranposeNPU(ctx, stream, &perm, trans_dx, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(argsort, - ops::ArgsortNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ArgsortNPUKernel, -#endif - ops::ArgsortNPUKernel); - -REGISTER_OP_NPU_KERNEL(argsort_grad, - ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc deleted file mode 100644 index ff88427c12336..0000000000000 --- a/paddle/fluid/operators/assign_op_npu.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/assign_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace framework { -class OpDesc; -class Variable; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { -template -class AssignNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - assign, - ops::AssignNPUKernel, - ops::AssignNPUKernel, - ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc deleted file mode 100644 index 25d8d07802ad1..0000000000000 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP_ITSELF(assign);
-USE_OP_DEVICE_KERNEL(assign, NPU);
-
-template <typename T>
-void Compare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             std::string op_type) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-
-  std::vector<T> init;
-  init.push_back(static_cast<T>(1.0));
-  init.push_back(static_cast<T>(2.0));
-  init.push_back(static_cast<T>(3.0));
-  init.push_back(static_cast<T>(4.0));
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({4});
-
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-
-  auto op =
-      f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
-
-  op->Run(*scope, place);
-
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-
-  ctx.Wait();
-
-  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
-  EXPECT_EQ(out_vec[0], static_cast<T>(1.0));
-  EXPECT_EQ(out_vec[1], static_cast<T>(2.0));
-  EXPECT_EQ(out_vec[2], static_cast<T>(3.0));
-  EXPECT_EQ(out_vec[3], static_cast<T>(4.0));
-}
-
-TEST(assign, NPU_fp32) {
-  f::Scope scope;
-  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
-  Compare<float>(&scope, *ctx, "assign");
-}
diff --git a/paddle/fluid/operators/assign_value_op_npu.cc b/paddle/fluid/operators/assign_value_op_npu.cc
deleted file mode 100644
index 5354f26d6fa73..0000000000000
--- a/paddle/fluid/operators/assign_value_op_npu.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "paddle/fluid/operators/assign_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(assign_value, - ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel); diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc deleted file mode 100644 index 15774d5712fff..0000000000000 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ /dev/null @@ -1,261 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/batch_norm_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class NPUBatchNormOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - bool test_mode = is_test && (!trainable_stats); - bool training = !test_mode && !use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - (x_dims.size() == 4UL || x_dims.size() == 3UL), - true, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 3 or 4. 
" - " But got X's shape = [%s], X's dimension = [%d].", - x_dims.to_str(), - x_dims.size())); - - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto y_tesnor = - ctx.AllocateTmpTensor(y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - y_tesnor.ShareDataWith(*y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - y_tesnor.set_layout(DataLayout::kNHWC); - } - - auto stream = ctx.template device_context().stream(); - if (!training) { - const auto &runner_infer = - NpuOpRunner("BNInfer", - {x_tensor, *scale, *bias, *running_mean, *running_var}, - {y_tesnor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - phi::DenseTensor sum, square_sum; - sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - - // BNTrainingReduce ONLY support rank = 4 - if (x->dims().size() == 3) { - auto x_shape_vec = phi::vectorize(x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - x_tensor.Resize(x_new_shape); - x_tensor.Resize(x_new_shape); - } - const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", - {x_tensor}, - {sum, square_sum}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - - const auto &runner_update = NpuOpRunner( - "BNTrainingUpdate", - {x_tensor, - sum, - square_sum, - *scale, - *bias, - *running_mean, - *running_var}, - {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, - {{"factor", momentum}, {"epsilon", epsilon}}); - runner_update.Run(stream); - } - } -}; - -template -class NPUBatchNormGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = - ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - auto *d_x 
= ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto dy_tensor = - ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - dy_tensor.ShareDataWith(*d_y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - dy_tensor.set_layout(DataLayout::kNHWC); - } - - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); - auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); - if (d_scale == nullptr) { - d_scale = &scale_grad_tmp; - } - if (d_bias == nullptr) { - d_bias = &bias_grad_tmp; - } - - auto stream = ctx.template device_context().stream(); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *running_mean, *running_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } else { - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *saved_mean, *saved_inv_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } - } - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto dx_tensor = - ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); - dx_tensor.ShareDataWith(*d_x); - if (data_layout == DataLayout::kNHWC) { - dx_tensor.set_layout(DataLayout::kNHWC); - } - if (use_global_stats) { - if (x->dims().size() == 3) { - // BNInferGrad only support x rank = 4, - auto x_shape_vec = phi::vectorize(d_x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - x_shape_vec.insert(x_shape_vec.begin() + 2, - 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - dx_tensor.Resize(x_new_shape); - dy_tensor.Resize(x_new_shape); - } - const auto *running_var = ctx.Input("Variance"); - const auto &runner_infer = - NpuOpRunner("BNInferGrad", - {dy_tensor, *scale, *running_var}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - const auto &runner_reduce = NpuOpRunner("BNTrainingReduceGrad", - {dy_tensor, - x_tensor, - *d_scale, - *d_bias, - *scale, - *saved_mean, - *saved_inv_variance}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(batch_norm, - ops::NPUBatchNormOpKernel, - ops::NPUBatchNormOpKernel); -REGISTER_OP_NPU_KERNEL(batch_norm_grad, - ops::NPUBatchNormGradOpKernel, - ops::NPUBatchNormGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc deleted file mode 100644 index ed8872d90ef6f..0000000000000 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class BCELossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *labels}, - {*out}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -template -class BCELossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *labels, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - bce_loss, - ops::BCELossNPUKernel, - ops::BCELossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - bce_loss_grad, - ops::BCELossGradNPUKernel, - ops::BCELossGradNPUKernel); diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc deleted file mode 100644 index 147d1be226255..0000000000000 --- a/paddle/fluid/operators/beam_search_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" - -namespace ops = paddle::operators; -using NPUCtx = paddle::platform::NPUDeviceContext; - -REGISTER_OP_NPU_KERNEL(beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc deleted file mode 100644 index 411e112318d12..0000000000000 --- a/paddle/fluid/operators/cast_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -namespace paddle { -namespace operators { - -static std::map - DTYPE_2_ACL_DTYPE = { - {framework::proto::VarType::BOOL, ACL_BOOL}, - {framework::proto::VarType::INT16, ACL_INT16}, - {framework::proto::VarType::INT32, ACL_INT32}, - {framework::proto::VarType::INT64, ACL_INT64}, - {framework::proto::VarType::FP16, ACL_FLOAT16}, - {framework::proto::VarType::FP32, ACL_FLOAT}, - {framework::proto::VarType::FP64, ACL_DOUBLE}, -}; - -template -class CastNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - - if (framework::TransToProtoVarType(x->dtype()) == dtype) { - // NOTE(zhiqiu): NPU cast op may result in wrong value, so - // add special case here. - VLOG(4) << "cast to same dtype:" << dtype; - out->mutable_data(place, x->type()); - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - return; - } - - auto iter = DTYPE_2_ACL_DTYPE.find( - static_cast(dtype)); - int aclDtype = iter->second; - - if (dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*out}, {{"dst_type", static_cast(aclDtype)}}); - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - cast, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op_npu.cc b/paddle/fluid/operators/clip_by_norm_op_npu.cc deleted file mode 100644 index f22f58d1769ea..0000000000000 --- a/paddle/fluid/operators/clip_by_norm_op_npu.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/clip_by_norm_op.h" - -namespace paddle { -namespace operators { - -template -class NPUClipByNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - - if (!(in_var->IsType())) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor" - "type, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - auto place = context.GetPlace(); - auto& dev_ctx = - context.template device_context(); - auto stream = dev_ctx.stream(); - - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(place); - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. " - "Please check if it is created correctly.")); - - phi::DenseTensor square_sum(input->type()); - square_sum.mutable_data(framework::DDim({1}), place); - const auto& x_dims = input->dims(); - std::vector axis; - for (int i = 0; i < x_dims.size(); ++i) { - axis.push_back(i); - } - const auto& square_sum_runner = - NpuOpRunner("SquareSumV1", - {*input}, - {square_sum}, - {{"axis", axis}, {"keep_dims", false}}); - square_sum_runner.Run(stream); - - phi::DenseTensor x_norm(input->type()); - x_norm.mutable_data(framework::DDim({1}), place); - const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {}); - x_norm_runner.Run(stream); - - phi::DenseTensor x_norm_t; - framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t); - auto x_norm_v = static_cast(*x_norm_t.data()); - if (x_norm_v <= max_norm) { - framework::TensorCopy(*input, place, dev_ctx, output); - } else { - auto epsilon = x_norm_v <= static_cast(1e-30) - ? static_cast(1e-6) - : static_cast(0); - float scaling = max_norm / (x_norm_v + epsilon); - const auto& muls_runner = - NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}}); - muls_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - clip_by_norm, - ops::NPUClipByNormKernel, - ops::NPUClipByNormKernel); diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc deleted file mode 100644 index 8977bd250e868..0000000000000 --- a/paddle/fluid/operators/clip_op_npu.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClipNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto min_tensor = - ctx.HasInput("Min") ? 
ctx.Input("Min") : nullptr; - auto max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - phi::DenseTensor min_tensor_temp(x->type()); - phi::DenseTensor max_tensor_temp(x->type()); - if (min_tensor == nullptr) { - auto min_value = static_cast(ctx.Attr("min")); - min_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&min_tensor_temp, min_value); - min_tensor = &min_tensor_temp; - } - - if (max_tensor == nullptr) { - auto max_value = static_cast(ctx.Attr("max")); - max_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&max_tensor_temp, max_value); - max_tensor = &max_tensor_temp; - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("ClipByValue", {*x, *min_tensor, *max_tensor}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ClipGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto* min_tensor = - ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto* max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - auto min_val = ctx.Attr("min"); - if (min_tensor) { - phi::DenseTensor min_data; - framework::TensorCopy( - *min_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &min_data); - ctx.template device_context().Wait(); - min_val = static_cast(min_data.data()[0]); - } - - auto max_val = ctx.Attr("max"); - if (max_tensor) { - phi::DenseTensor max_data; - framework::TensorCopy( - *max_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &max_data); - ctx.template device_context().Wait(); - max_val = static_cast(max_data.data()[0]); - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("HardtanhGrad", - {*x, *dout}, - {*dx}, - {{"min_val", min_val}, {"max_val", max_val}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - clip, - ops::ClipNPUKernel, - ops::ClipNPUKernel); - -REGISTER_OP_NPU_KERNEL( - clip_grad, - ops::ClipGradNPUKernel, - ops::ClipGradNPUKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index c20200f6be316..baee3d20daebd 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -84,73 +84,4 @@ if(WITH_ASCEND_CL) device_context enforce executor) - cc_test( - c_broadcast_op_npu_test - SRCS c_broadcast_op_npu_test.cc - DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_allreduce_sum_op_npu_test - SRCS c_allreduce_sum_op_npu_test.cc - DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_reducescatter_op_npu_test - SRCS c_reducescatter_op_npu_test.cc - DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_allgather_op_npu_test - SRCS c_allgather_op_npu_test.cc - DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_reduce_sum_op_npu_test - SRCS c_reduce_sum_op_npu_test.cc - DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_allreduce_max_op_npu_test - SRCS 
c_allreduce_max_op_npu_test.cc - DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - send_v2_op_npu_test - SRCS send_v2_op_npu_test.cc - DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - recv_v2_op_npu_test - SRCS recv_v2_op_npu_test.cc - DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - checknumeric - SRCS checknumeric_npu_test.cc - DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test( - c_sync_comm_stream_op_npu_test - SRCS c_sync_comm_stream_op_npu_test.cc - DEPS op_registry - c_broadcast_op - c_comm_init_hccl_op - c_sync_comm_stream_op - c_gen_hccl_id_op - gen_hccl_id_op_helper - ${COLLECTIVE_DEPS} - ascend_hccl - dynamic_loader - dynload_warpctc - scope - device_context - enforce - executor) - cc_test( - c_sync_calc_stream_op_npu_test - SRCS c_sync_calc_stream_op_npu_test.cc - DEPS op_registry - elementwise_add_op - c_sync_calc_stream_op - c_gen_hccl_id_op - gen_hccl_id_op_helper - ${COLLECTIVE_DEPS} - ascend_hccl - dynamic_loader - dynload_warpctc - scope - device_context - enforce - executor) endif() diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc deleted file mode 100644 index 296174656f7a1..0000000000000 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/collective/c_allgather_op.h" - -namespace paddle { -namespace operators { - -template -class CAllGatherOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_allgather, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel, - ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc deleted file mode 100644 index ca4fd7377102d..0000000000000 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allgather_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_allgather); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_allgather, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - - int num1 = 1; - int num2 = 4; - - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["ring_id"] = 0; - attrs["nranks"] = 2; - - auto op = f::OpRegistry::CreateOp( - "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - EXPECT_EQ(out_vec.size(), init.size() * 2); - for (uint32_t i = 0; i < out_vec.size() / 2; i++) { - EXPECT_EQ(out_vec[i], 1.0); - } - for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 2.0); - } -} - -TEST(c_allgather, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLAllGatherOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc deleted file mode 100644 index e7fc35a24e930..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_max, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc deleted file mode 100644 index 65dcfaa711261..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allgather_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_allreduce_max); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - 
endpointList[rank_id == 0 ? 1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - - int num1 = 100; - int num2 = 100; - - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id * 3); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["ring_id"] = 0; - - auto op = f::OpRegistry::CreateOp( - "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 4.0); - } -} - -TEST(c_allreduce_max, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLAllReduceOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc deleted file mode 100644 index 04beca3765d45..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_min, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc deleted file mode 100644 index 21ae06f57c790..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_prod, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc deleted file mode 100644 index ecc7fc566f68b..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - c_allreduce_sum, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc deleted file mode 100644 index cd1d66e0ea0ea..0000000000000 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1 -// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test -// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0 -// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_allreduce_sum); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - std::cout << preStr << ":" << std::endl << debugstring; - for (auto ele : data) { - std::cout << ele << " "; - } - std::cout << std::endl; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -template -void TestHCCLAllReduceOp(f::Scope* scope, - const p::DeviceContext& ctx, - int iter) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - int rank_id = atoi(getenv("RANK_ID")); - int num1 = 3; - int num2 = 128; - - std::vector init; - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(static_cast(1.0 + rank_id)); - } - init[0] = static_cast(std::numeric_limits::quiet_NaN()); - PrintDebugInfo("input data", init); - - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx_" + std::to_string(iter)); - attrs["ring_id"] = 0; - attrs["use_calc_stream"] = 1; - - auto op = f::OpRegistry::CreateOp( - "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - for (int i = 0; i < 1; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - float diff = static_cast(out_vec[0]) - 65504; - EXPECT_TRUE(diff < 0.1 && diff > -0.1); - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 1; i < 10; i++) { - EXPECT_EQ(out_vec[i], static_cast(3.0)); - } -} - -TEST(c_allreduce_sum, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - // only support one device, if more than one device, use first default - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - - TestHCCLAllReduceOp(&scope, ctx, 1); - // TestHCCLAllReduceOp(&scope, ctx, 0); -} diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc deleted file mode 100644 index 8642dfd6088fa..0000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_broadcast_op.h" - -namespace paddle { -namespace operators { - -template -class CBroadcastOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_broadcast, - ops::CBroadcastOpASCENDKernel, - ops::CBroadcastOpASCENDKernel, - ops::CBroadcastOpASCENDKernel, - ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc deleted file mode 100644 index fa6a7374de687..0000000000000 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_broadcast); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_broadcast, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - int num = 2; - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - - for (int64_t i = 0; i < num * num; ++i) { - init.push_back(1.0 + rank_id); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num, num}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - 
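// The pattern above is shared by all of these HCCL tests: inputs are staged
// with TensorFromVector and the output buffer is allocated up front, with a
// ctx.Wait() after each step, because the H2D/D2H copies are asynchronous on
// the NPU stream; reading OutData back without the wait would race with the
// collective kernel.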
ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["root"] = 0; - attrs["ring_id"] = 0; - - auto op = f::OpRegistry::CreateOp( - "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 1.0); - } -} - -TEST(c_broadcast, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLBroadcastOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc deleted file mode 100644 index ef23a8a87e733..0000000000000 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/collective/c_embedding_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-inline void FillNPU(Tensor *dst,
-                    T val,
-                    const framework::ExecutionContext &context) {
-  Tensor value(dst->type());
-  value.mutable_data<T>({1}, context.GetPlace());
-  FillNpuTensorWithConstant<T>(&value, static_cast<T>(val));
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-
-  const auto &runner = NpuOpRunner(
-      "FillD", {value}, {*dst}, {{"dims", phi::vectorize(dst->dims())}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void shard_index(const Tensor &table_t,
-                 const Tensor &ids_t,
-                 int64_t start_idx,
-                 const Tensor &id_t,
-                 const framework::ExecutionContext &context) {
-  const int height = table_t.dims()[0];
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  phi::DenseTensor id_t_d;
-  id_t_d.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&id_t_d, static_cast<T>(0.0), context);
-  id_t_d.Resize(ids_t.dims());
-
-  phi::DenseTensor id_t_u;
-  id_t_u.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&id_t_u, static_cast<T>(height - 1), context);
-  id_t_u.Resize(ids_t.dims());
-
-  phi::DenseTensor id_matched_d;
-  id_matched_d.mutable_data<bool>(ids_t.dims(), context.GetPlace());
-  phi::DenseTensor id_matched_u;
-  id_matched_u.mutable_data<bool>(ids_t.dims(), context.GetPlace());
-  phi::DenseTensor ignore_tensor;
-  ignore_tensor.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&ignore_tensor, static_cast<T>(height), context);
-  ignore_tensor.Resize(ids_t.dims());
-
-  NpuOpRunner sub_runner;
-#if (CANN_VERSION_CODE >= 503003)
-  Tensor factor_tensor(ids_t.type());
-  factor_tensor.mutable_data<T>({1}, context.GetPlace());
-  paddle::framework::TensorFromVector(
-      std::vector<T>{static_cast<T>(start_idx)},
-      context.device_context(),
-      &factor_tensor);
-  sub_runner.SetType("Sub")
-      .AddInput(ids_t)
-      .AddInput(factor_tensor)
-      .AddOutput(id_t);
-#else
-  sub_runner.SetType("Sub")
-      .AddInput(ids_t)
-      .AddInput(std::vector<T>{static_cast<T>(start_idx)})
-      .AddOutput(id_t);
-#endif
-  sub_runner.Run();
-
-  NpuOpRunner lessequal1_runner;
-  lessequal1_runner.SetType("LessEqual")
-      .AddInput(id_t)
-      .AddInput(id_t_u)
-      .AddOutput(id_matched_u);
-  lessequal1_runner.Run();
-
-  NpuOpRunner lessequal2_runner;
-  lessequal2_runner.SetType("LessEqual")
-      .AddInput(id_t_d)
-      .AddInput(id_t)
-      .AddOutput(id_matched_d);
-  lessequal2_runner.Run();
-
-  NpuOpRunner("Equal", {id_matched_u, id_matched_d}, {id_matched_d}, {})
-      .Run(stream);
-  NpuOpRunner("Select", {id_matched_d, id_t, ignore_tensor}, {id_t}, {})
-      .Run(stream);
-}
-
-template <typename TIds, typename T>
-void NPUGetIdsEmbedding(const framework::ExecutionContext &context) {
-  auto *table_t = context.Input<phi::DenseTensor>("W");
-  auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-  auto *output_t = context.Output<phi::DenseTensor>("Out");
-  const int64_t start_idx = context.Attr<int64_t>("start_index");
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-
-  phi::DenseTensor ids_t_local;
-  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
-  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
-
-  auto pad_shape =
-      phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
-  phi::DenseTensor table_t_pad;
-
-  size_t mem_size = table_t->numel() * phi::SizeOf(table_t->dtype());
-  size_t line_mem_size = table_t->dims()[1] * phi::SizeOf(table_t->dtype());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "NPU requires the second dim of the embedding "
-                        "table to be aligned to 64 bytes"));
-
-  VLOG(10) << "mem_size:" << mem_size << ",line_mem_size:" << line_mem_size
-           << ", pad_shape:" << pad_shape
-           << ", table_dims:" << table_t->dims();
-
-  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
-      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  platform::NPUMemcpyAsync(pad_data,
-                           table_t->data<T>(),
-                           mem_size,
-                           ACL_MEMCPY_DEVICE_TO_DEVICE,
-                           stream,
-                           mem_size);
-  platform::NPUMemsetAsync(
-      pad_data + mem_size, 0, line_mem_size, stream, line_mem_size);
-
-  output_t->mutable_data<T>(context.GetPlace());
-  NpuOpRunner runner;
-  runner.SetType("GatherV2")
-      .AddInput(table_t_pad)
-      .AddInput(ids_t_local)
-      .AddInput(std::vector<int32_t>{0})
-#if (CANN_VERSION_CODE >= 503003)
-      .AddAttrs({{"batch_dims", 0}})
-#endif
-      .AddOutput(*output_t);
-  runner.Run();
-}
-
-template <typename T>
-class CEmbeddingNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-
-    const auto &index_type = framework::TransToProtoVarType(ids_t->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      NPUGetIdsEmbedding<int32_t, T>(context);
-    } else {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "NPU c_embedding ids only support int32."));
-    }
-  }
-};
-
-template <typename TIds, typename T>
-void NPUUpdateEmbedding(const framework::ExecutionContext &context) {
-  // get inputs
-  const int64_t start_idx = context.Attr<int64_t>("start_index");
-  auto ids_t = context.Input<phi::DenseTensor>("Ids");
-  auto d_output_t =
-      context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-  auto table_t = context.Input<phi::DenseTensor>("W");
-  auto table_grad_t =
-      context.Output<phi::DenseTensor>(framework::GradVarName("W"));
-
-  VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t
-           << ", table_t:" << table_t << ", table_grad_t:" << table_grad_t;
-
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-
-  // convert ids_t to local valid ids
-  phi::DenseTensor ids_t_local;
-  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
-  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
-
-  // padding table_t -> table_t_pad
-  auto pad_shape =
-      phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
-  phi::DenseTensor table_t_pad;
-
-  // set table_t_pad to zero
-  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
-      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  size_t table_t_pad_mem_size =
-      table_t_pad.numel() *
-      framework::SizeOfType(
-          framework::TransToProtoVarType(table_t_pad.dtype()));
-  platform::NPUMemsetAsync(
-      pad_data, 0, table_t_pad_mem_size, stream, table_t_pad_mem_size);
-
-  // NOTE(zhiqiu): It seems that in cann 20.1 the first input and the output
-  // can be different tensors, but in cann 20.2+ the op works in place.
-  // Thus, the first input and the output must be the same tensor.
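// ScatterAdd accumulates the rows of d_output_t into table_t_pad at the
// positions in ids_t_local. Ids owned by other shards were remapped to the
// sentinel row (index == height) by shard_index, so their gradients land in
// the padding row and are discarded by the copy-back below; "use_locking"
// makes accumulation into a given row atomic when ids repeat.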
-  const auto &runner_scatter =
-      NpuOpRunner("ScatterAdd",
-                  {table_t_pad, ids_t_local, *d_output_t},
-                  {table_t_pad},
-                  {{"use_locking", true}});
-  runner_scatter.Run(stream);
-
-  // copy table_t_pad back to table_grad_t
-  T *dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
-  const size_t mem_size =
-      table_grad_t->numel() * phi::SizeOf(table_grad_t->dtype());
-
-  // check alignment
-  size_t line_mem_size =
-      table_grad_t->dims()[1] * phi::SizeOf(table_grad_t->dtype());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "NPU requires the second dim of the embedding "
-                        "table to be aligned to 64 bytes"));
-
-  platform::NPUMemcpyAsync(
-      dst, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream, mem_size);
-}
-
-template <typename T>
-class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-
-    const auto &index_type = framework::TransToProtoVarType(ids_t->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      NPUUpdateEmbedding<int32_t, T>(context);
-    } else {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "c_embedding ids only support int32."));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_embedding,
-                       ops::CEmbeddingNPUKernel<float>,
-                       ops::CEmbeddingNPUKernel<double>,
-                       ops::CEmbeddingNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(c_embedding_grad,
-                       ops::CEmbeddingGradNPUKernel<float>,
-                       ops::CEmbeddingGradNPUKernel<double>,
-                       ops::CEmbeddingGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_identity_op_npu.cc b/paddle/fluid/operators/collective/c_identity_op_npu.cc
deleted file mode 100644
index b97743cf14d6f..0000000000000
--- a/paddle/fluid/operators/collective/c_identity_op_npu.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/collective/c_identity_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(c_identity,
-                       ops::CIdentityOpKernel<float>,
-                       ops::CIdentityOpKernel<double>,
-                       ops::CIdentityOpKernel<int>,
-                       ops::CIdentityOpKernel<int64_t>,
-                       ops::CIdentityOpKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
deleted file mode 100644
index 50d52e0ad1ac8..0000000000000
--- a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_max, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc deleted file mode 100644 index b94da957e8f16..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_min, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc deleted file mode 100644 index 7515ffad25f3e..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_prod, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc deleted file mode 100644 index 6f056520df20d..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reduce_sum, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel, - ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc deleted file mode 100644 index 67831aee39b82..0000000000000 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_reduce_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_reduce_sum); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(3) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - int rank_id = atoi(getenv("RANK_ID")); - int num1 = 3; - int num2 = 128; - - std::vector init; - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); - } - PrintDebugInfo("input data", init); - - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - ctx.Wait(); - - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx_" + std::to_string(iter)); - attrs["ring_id"] = 0; - int root_id = 0; - attrs["root_id"] = root_id; - - auto op = f::OpRegistry::CreateOp( - "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - if (rank_id == root_id) { - EXPECT_EQ(out_vec[i], 3.0); - } else { - EXPECT_EQ(out_vec[i], init[i]); - } - } -} - -TEST(c_reduce_sum, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - for (int i = 0; i < 2; i++) { - VLOG(2) << "iter num: " << i; - TestHCCLReduceOp(&scope, ctx, i); - } -} diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc deleted file mode 100644 index d6bfcd1635a34..0000000000000 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" - -namespace paddle { -namespace operators { - -template -class CReduceScatterOpAscendKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(c_reducescatter, - ops::CReduceScatterOpAscendKernel, - ops::CReduceScatterOpAscendKernel, - ops::CReduceScatterOpAscendKernel, - ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc deleted file mode 100644 index 3adaa8f4c85e6..0000000000000 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ /dev/null @@ -1,183 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_allgather_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_reducescatter_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_reducescatter); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - - std::vector init; - int num1 = 4; - int num2 = 1; - - for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0); - } - PrintDebugInfo("input data", init); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num1, num2}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["ring_id"] = 0; - attrs["nranks"] = 2; - - auto op = f::OpRegistry::CreateOp( - "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - int iter_num = 10; - for (int i = 0; i < iter_num; i++) { - op->Run(*scope, place); - ctx.Wait(); - } - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - PrintDebugInfo("output data", out_vec); - EXPECT_EQ(out_vec.size(), init.size() / 2); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 2.0); - } -} - -TEST(c_reducescatter, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLReduceScatterOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc deleted file mode 100644 index abd25fa9e8f61..0000000000000 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto y = scope->Var("Y"); - auto tensor_y = y->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - std::vector init_y; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_y.push_back(static_cast(2.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - paddle::framework::TensorFromVector(init_y, ctx, tensor_y); - tensor_y->Resize({10, 10}); - - f::AttributeMap attrs; - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // sync data - auto sync_op0 = f::OpRegistry::CreateOp( - "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - sync_op0->Run(*scope, place); - - // run - - auto op = f::OpRegistry::CreateOp("elementwise_add", - {{"X", {"X"}}, {"Y", {"Y"}}}, - {{"Out", {"Out"}}}, - attrs); - - op->Run(*scope, place); - - // sync op run - auto sync_op = f::OpRegistry::CreateOp( - "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - sync_op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - // sync op copy - auto sync_op2 = f::OpRegistry::CreateOp( - "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - sync_op2->Run(*scope, place); - - float expected = 3.0; - - EXPECT_EQ(out_vec.size(), init_x.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(expected)); - } -} - -TEST(c_sync_calc_stream, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc deleted file mode 100644 index daac829c32c5a..0000000000000 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ /dev/null @@ -1,184 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(c_broadcast); -USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(c_broadcast, NPU); - -DECLARE_string(selected_npus); - -template -void PrintDebugInfo(const std::string preStr, const std::vector& data) { - std::string debugstring = ""; - for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); - } - VLOG(2) << preStr << ":" << std::endl << debugstring; -} - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - // init - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - int num = 2; - std::vector init; - int rank_id = atoi(getenv("RANK_ID")); - std::cout << "rank_id:" << rank_id << std::endl; - for (int64_t i = 0; i < num * num; ++i) { - init.push_back(1.0 + rank_id); - std::cout << init[0]; - } - std::cout << std::endl; - - 
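// Unlike the standalone broadcast test in c_broadcast_op_npu_test.cc, this
// variant deliberately omits ctx.Wait() after launching c_broadcast; it
// relies on the c_sync_comm_stream op run below to order the HCCL
// communication stream before TensorToVector reads OutData back.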
paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num, num}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("OutData"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs["tag"] = std::string("tagx"); - attrs["root"] = 0; - attrs["ring_id"] = 0; - - auto op = f::OpRegistry::CreateOp( - "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - op->Run(*scope, place); - - // comm sync - - auto sync_op = f::OpRegistry::CreateOp( - "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - sync_op->Run(*scope, place); - - // ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 1.0); - } -} - -TEST(c_sync_comm_stream_op, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHCCLBroadcastOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc deleted file mode 100644 index 61d51f2857788..0000000000000 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <cmath>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_allreduce_sum);
-USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
-DECLARE_string(selected_npus);
-
-template <typename T>
-bool Check(T value, int size = 2 * 512 * 8192) {
-  f::Scope scope;
-  auto x = scope.Var("in");
-  auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
-      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
-  auto place = ctx.GetPlace();
-
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  tensor_x->Resize({size});
-  tensor_x->mutable_data<T>(place);  // allocate
-
-  std::vector<T> init;
-  for (int64_t i = 0; i < size; ++i) {
-    init.push_back(static_cast<T>(value));
-  }
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
-  return result;
-}
-
-TEST(check_numeric, NPU) {
-  auto inf = std::numeric_limits<float>::infinity();
-  auto fp16_inf = static_cast<p::float16>(inf);
-  auto nan = NAN;
-  auto fp16_nan = static_cast<p::float16>(nan);
-
-  bool result = false;
-  // Normal
-  VLOG(0) << "start normal";
-  result = Check<p::float16>(static_cast<p::float16>(65546));
-  ASSERT_FALSE(result);
-  Check<float>(static_cast<float>(1.0));
-  ASSERT_FALSE(result);
-
-  // Inf
-  VLOG(0) << "start inf";
-  result = Check<p::float16>(fp16_inf);
-  ASSERT_FALSE(result);
-  result = Check<float>(inf);
-  ASSERT_FALSE(result);
-
-  // Nan
-  VLOG(0) << "start nan";
-  result = Check<p::float16>(fp16_nan);
-  ASSERT_TRUE(result);
-  result = Check<float>(nan);
-  ASSERT_TRUE(result);
-}
diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
deleted file mode 100644
index 0054cfa468746..0000000000000
--- a/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
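The registration that follows reuses the generic ASCEND allreduce kernel
declared in c_allreduce_op.h, tagged with the sum reduce type, exactly as
c_allreduce_sum does; mp_allreduce_sum is only a model-parallel alias. A
minimal sketch of the kernel shape being instantiated (an assumption based on
the stub kernels visible elsewhere in this diff, not the verbatim header):

// Sketch only, not the verbatim contents of c_allreduce_op.h.
template <ReduceType red_type, typename T>
class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Without Ascend support compiled in, the kernel can only fail loudly,
    // matching the c_broadcast and c_reducescatter stubs above.
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};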
- -#include "paddle/fluid/operators/collective/c_allreduce_op.h" - -namespace paddle { -namespace platform { -struct ASCENDPlace; -} // namespace platform -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - mp_allreduce_sum, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel, - ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc deleted file mode 100644 index 28a4266dcc989..0000000000000 --- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/collective/partial_allgather_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -class CallPartialGatherOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(partial_allgather, - ops::CallPartialGatherOpASCENDKernel, - ops::CallPartialGatherOpASCENDKernel, - ops::CallPartialGatherOpASCENDKernel, - ops::CallPartialGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/partial_recv_op_npu.cc b/paddle/fluid/operators/collective/partial_recv_op_npu.cc deleted file mode 100644 index a5c53a7900a20..0000000000000 --- a/paddle/fluid/operators/collective/partial_recv_op_npu.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/partial_recv_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -class PartialRecvOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(partial_recv, - ops::PartialRecvOpASCENDKernel, - ops::PartialRecvOpASCENDKernel, - ops::PartialRecvOpASCENDKernel, - ops::PartialRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/partial_send_op_npu.cc b/paddle/fluid/operators/collective/partial_send_op_npu.cc deleted file mode 100644 index 47343148d8ae9..0000000000000 --- a/paddle/fluid/operators/collective/partial_send_op_npu.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/send_v2_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -class PartialSendOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(partial_send, - ops::PartialSendOpASCENDKernel, - ops::PartialSendOpASCENDKernel, - ops::PartialSendOpASCENDKernel, - ops::PartialSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc deleted file mode 100644 index 6ea6c12efe319..0000000000000 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
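Each stub above was registered four times, but extraction dropped the template arguments from every REGISTER_OP_NPU_KERNEL list. A hedged reconstruction of one such list, with the four dtypes an explicit assumption based on typical fluid collective registrations:

// Hedged reconstruction of a stripped registration list; the exact dtype
// set is an assumption (float/float16/int variants were typical).
REGISTER_OP_NPU_KERNEL(partial_send,
                       ops::PartialSendOpASCENDKernel<float>,
                       ops::PartialSendOpASCENDKernel<int>,
                       ops::PartialSendOpASCENDKernel<int8_t>,
                       ops::PartialSendOpASCENDKernel<plat::float16>);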
*/ - -#include "paddle/fluid/operators/collective/recv_v2_op.h" - -#include "paddle/fluid/distributed/collective/process_group.h" -#include "paddle/phi/api/include/tensor.h" - -namespace paddle { -namespace operators { - -template -class CRecvOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(recv_v2, - ops::CRecvOpASCENDKernel, - ops::CRecvOpASCENDKernel, - ops::CRecvOpASCENDKernel, - ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc deleted file mode 100644 index ba298342a123e..0000000000000 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/operators/collective/recv_v2_op.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(recv_v2); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(recv_v2, NPU); - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - - int num = atoi(getenv("DATA_SIZE")); - EXPECT_GT(num, 0); - EXPECT_LT(num, 1 << 15); - int rank_id = atoi(getenv("RANK_ID")); - VLOG(3) << "rank_id:" << rank_id << std::endl; - - ctx.Wait(); - auto place = ctx.GetPlace(); - auto out = scope->Var("Data"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - f::AttributeMap attrs; - attrs["tag"] = std::string("srtest"); - attrs["peer"] = atoi(getenv("SRC_RANK")); - attrs["ring_id"] = 0; - attrs["srTag"] = 0; - std::vector out_shape; - out_shape.push_back(num); - out_shape.push_back(num); - attrs["out_shape"] = out_shape; - - auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); - VLOG(3) << "CreateOp recv_v2"; - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - VLOG(3) << "Run op recv_v2"; - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); - EXPECT_EQ(out_vec == init, true); -} - -TEST(recv_v2, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - char* npu_id = getenv("FLAGS_selected_npus"); - VLOG(3) << "Select npu:" << npu_id; - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHcomRecvOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc deleted file mode 100644 index 9500f4c211a9b..0000000000000 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/send_v2_op.h" - -#include "paddle/fluid/distributed/collective/process_group.h" -#include "paddle/phi/api/include/tensor.h" - -namespace paddle { -namespace operators { - -template -class CSendOpASCENDKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(send_v2, - ops::CSendOpASCENDKernel, - ops::CSendOpASCENDKernel, - ops::CSendOpASCENDKernel, - ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc deleted file mode 100644 index bb39fd5110546..0000000000000 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/operators/collective/send_v2_op.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(send_v2); -USE_NO_KERNEL_OP(c_gen_hccl_id); -USE_NO_KERNEL_OP(c_comm_init_hccl); -USE_OP_DEVICE_KERNEL(send_v2, NPU); - -void PrepareUniqueId(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - std::vector rank_ids{0, 1}; - f::AttributeMap gen_hccl_id; - - std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; - gen_hccl_id["rank"] = rank_id; - gen_hccl_id["endpoint"] = endpointList[rank_id]; - std::vector other_endpoints = { - endpointList[rank_id == 0 ? 
1 : 0]}; - gen_hccl_id["other_endpoints"] = other_endpoints; - - auto out = scope->Var("Out"); - auto id = out->GetMutable(); - - VLOG(3) << "break"; - - auto comm_init_op = f::OpRegistry::CreateOp( - "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); - VLOG(3) << "break"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); - - memcpy(hccl_id, id, 1024); -} - -void Prepare(f::Scope* scope, - const p::DeviceContext& ctx, - HcclRootInfo* hccl_id) { - auto x = scope->Var("X"); - auto id = x->GetMutable(); - - memcpy(id, hccl_id, 1024); - - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - - VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id - << "; rank_id = " << rank_id - << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); - - // std::vector rank_ids{0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["rank_ids"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - // comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp( - "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); -} - -void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - auto x = scope->Var("Data"); - auto tensor_x = x->GetMutable(); - int num = atoi(getenv("DATA_SIZE")); - - EXPECT_GT(num, 0); - EXPECT_LT(num, 1 << 15); - std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); - int rank_id = atoi(getenv("RANK_ID")); - VLOG(3) << "rank id:" << rank_id; - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({num, num}); - ctx.Wait(); - auto place = ctx.GetPlace(); - ctx.Wait(); - - f::AttributeMap attrs; - attrs["tag"] = std::string("srtest"); - attrs["peer"] = atoi(getenv("DEST_RANK")); - attrs["ring_id"] = 0; - attrs["srTag"] = 0; - - auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); - - for (int i = 0; i < 10; i++) { - op->Run(*scope, place); - } - VLOG(3) << "send run over"; - ctx.Wait(); -} - -TEST(send_v2, NPU) { - f::Scope scope; - HcclRootInfo hccl_id; - - char* npu_id = getenv("FLAGS_selected_npus"); - VLOG(3) << "Select npu:" << npu_id; - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - - PrepareUniqueId(&scope, ctx, &hccl_id); - Prepare(&scope, ctx, &hccl_id); - TestHcomSendOp(&scope, ctx); -} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc deleted file mode 100644 index 491d44efa7261..0000000000000 --- a/paddle/fluid/operators/concat_op_npu.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
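Both deleted tests read their topology straight from the environment with atoi(getenv(...)), which dereferences a null pointer whenever a variable is unset, and their "RANK_TABLE_FILE" log line actually prints DEVICE_ID again. A defensive helper of the kind these tests could have used (illustrative, not part of the deleted code):

#include <cstdlib>

// Safe environment lookup: getenv() returns nullptr for unset variables,
// so guard before converting instead of calling atoi(getenv(name)) directly.
static int GetEnvInt(const char* name, int default_value) {
  const char* value = std::getenv(name);
  return value != nullptr ? std::atoi(value) : default_value;
}

// Usage: int rank_id = GetEnvInt("RANK_ID", 0);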
*/ - -#include "paddle/fluid/operators/concat_op.h" - -namespace paddle { -namespace operators { - -template -class ConcatNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - phi::DenseTensor* out = ctx.Output("Out"); - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initalized.")); - auto axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - PADDLE_THROW(platform::errors::NotFound( - "The AxisTensor is not supported on NPU now.")); - } - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - std::vector inputs; - std::vector names; - for (size_t i = 0; i < ins.size(); ++i) { - if (ins[i] && ins[i]->numel() > 0) { - inputs.push_back(*ins[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; - } - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*out}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } -}; - -template -class ConcatGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - auto outs = ctx.MultiOutput(framework::GradVarName("X")); - - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initalized.")); - - auto axis = ctx.Attr("axis"); - - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - int offset = 0; - auto stream = - ctx.template device_context() - .stream(); - for (size_t j = 0; j < outs.size(); ++j) { - // For stop gradient - // get output tensor that the name is not kEmptyVarName - if (out_var_names[j] != framework::kEmptyVarName && - outs[j]->numel() != 0UL) { - outs[j]->mutable_data(ctx.GetPlace()); - std::vector offsets; - std::vector sizes; - for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { - if (dim == axis) { - offsets.push_back(offset); - sizes.push_back(ins[j]->dims()[dim]); - } else { - offsets.push_back(0); - sizes.push_back(ins[j]->dims()[dim]); - } - } - const auto& runner = - NpuOpRunner("SliceD", - {*out_grad}, - {*outs[j]}, - {{"offsets", offsets}, {"size", sizes}}); - runner.Run(stream); - } - if (ins[j]->numel() != 0UL) { - offset += ins[j]->dims()[axis]; - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(concat, - ops::ConcatNPUKernel, - ops::ConcatNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatNPUKernel, -#endif - ops::ConcatNPUKernel); - -REGISTER_OP_NPU_KERNEL(concat_grad, - ops::ConcatGradNPUKernel, - ops::ConcatGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatGradNPUKernel, -#endif - ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc deleted file mode 100644 index ae6fd8a6fb222..0000000000000 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" - -namespace paddle { -namespace operators { - -template -class EqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class NotEqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("NotEqual", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class LessThanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Less", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class LessEqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("LessEqual", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GreaterThanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Greater", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GreaterEqualNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("GreaterEqual", {*x, *y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - 
equal, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel); - -REGISTER_OP_NPU_KERNEL( - not_equal, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel, - ops::NotEqualNPUKernel); - -REGISTER_OP_NPU_KERNEL( - less_than, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - less_equal, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel); - -REGISTER_OP_NPU_KERNEL( - greater_than, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - greater_equal, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc deleted file mode 100644 index de29f3689cd84..0000000000000 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
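Extraction stripped every angle-bracketed token from the comparison kernels above: template parameter lists, the Input/Output tensor types, and the dtypes in the registration lists. The pattern all six kernels (equal through greater_equal) followed, reconstructed under the usual fluid conventions (the exact parameter list and types are assumptions):

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

// Reconstructed shape of the deleted comparison kernels; template
// parameters and tensor types are assumed from fluid conventions.
template <typename DeviceContext, typename T>
class EqualNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* y = ctx.Input<phi::DenseTensor>("Y");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<bool>(ctx.GetPlace());  // comparisons produce bool

    const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle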
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class LogicalNotNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LogicalOrNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("LogicalOr", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LogicalAndPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("LogicalAnd", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(logical_not, - ops::LogicalNotNPUKernel); - -REGISTER_OP_NPU_KERNEL(logical_or, - ops::LogicalOrNPUKernel); - -REGISTER_OP_NPU_KERNEL(logical_and, - ops::LogicalAndPUKernel); diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc deleted file mode 100644 index 44fb1aa5a1759..0000000000000 --- a/paddle/fluid/operators/conv_op_npu.cc +++ /dev/null @@ -1,688 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
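The three logical kernels above differ only in the ACL operator name they pass to NpuOpRunner (note that the last class is misspelled LogicalAndPUKernel rather than LogicalAndNPUKernel). A single helper would have covered both binary cases; a sketch using only the NpuOpRunner API visible in the deleted code:

#include <string>

// Shared body of the deleted logical_or / logical_and kernels; only the
// ACL op name ("LogicalOr" vs. "LogicalAnd") differs between them.
template <typename T>
void RunBinaryLogicalOp(const std::string& acl_op_name,
                        const paddle::framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<phi::DenseTensor>("X");
  auto* y = ctx.Input<phi::DenseTensor>("Y");
  auto* out = ctx.Output<phi::DenseTensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  auto stream =
      ctx.template device_context<paddle::platform::NPUDeviceContext>()
          .stream();
  const auto& runner = NpuOpRunner(acl_op_name, {*x, *y}, {*out}, {});
  runner.Run(stream);
}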
- -#include "paddle/fluid/operators/conv_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; -static void CastToFP16(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT16) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class DepthwiseConvNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1], - platform::errors::InvalidArgument( - "ShapeError: The output channels must be equal to the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1], - input->dims()[1], - platform::errors::InvalidArgument( - "ShapeError: The output channels must be equal to the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], - input->dims()[1])); - } - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - auto stream = ctx.template device_context().stream(); - - // Transform filter (n, 1, h, w) --> (1, n, h, w) - phi::DenseTensor transformed_filter(filter->type()); - transformed_filter.mutable_data({filter->dims()[1], - filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - const auto& runner = NpuOpRunner("DepthwiseConv2D", - {input_tensor, transformed_filter}, - {output_tensor}, - {{"strides", strides}, - {"dilations", dilations}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class DepthwiseConvGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - auto stream = ctx.template device_context().stream(); - - // Transform filter (n, 1, h, w) --> (1, n, h, w) - phi::DenseTensor transformed_filter(filter->type()); - transformed_filter.mutable_data({filter->dims()[1], - 
filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_EQ( - (dilations[2] == 1 && dilations[3] == 1), - true, - platform::errors::InvalidArgument( - "dilation_h and dilation_w in DepthwiseConv2DBackpropFilterD " - "must be equal to 1, but got dilation_h %d, dilation_w %d", - dilation[2], - dilation[3])); - - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropFilterD") - .AddInput(input_tensor) - .AddInput(output_grad_tensor) - .AddOutput(*filter_grad) - .AddAttr("filter_size", phi::vectorize(transformed_filter.dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropInputD") - .AddInput(transformed_filter) - .AddInput(output_grad_tensor) - .AddOutput(input_grad_tensor) - .AddAttr("input_size", phi::vectorize(input->dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - } -}; - -template -class NPUConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor 
input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2D", - {input_tensor, *filter}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_fp32(phi::DataType::FLOAT32); - filter_grad_fp32.Resize(filter_grad->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); - } else { - filter_grad_fp32.ShareDataWith(*filter_grad); - } - - const auto& runner = NpuOpRunner("Conv2DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_fp32}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", 
dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); - } - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2DBackpropInputD", - {*filter, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class NPUConv3dKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_tensor = - ctx.AllocateTmpTensor(output->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_tensor.ShareDataWith(*output); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv3D", - {input_tensor, filter_tensor}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConv3dGradKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_grad_tensor = ctx.AllocateTmpTensor( - output_grad->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_grad_tensor.ShareDataWith(*output_grad); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_grad_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_tensor = - ctx.AllocateTmpTensor(filter_grad->dims(), - dev_ctx); - filter_grad_tensor.ShareDataWith(*filter_grad); - filter_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = NpuOpRunner("Conv3DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_tensor}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor = - ctx.AllocateTmpTensor(input_grad->dims(), - dev_ctx); - input_grad_tensor.ShareDataWith(*input_grad); - input_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = 
NpuOpRunner("Conv3DBackpropInputD", - {filter_tensor, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d, - ops::DepthwiseConvNPUKernel, - ops::DepthwiseConvNPUKernel); - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d_grad, - ops::DepthwiseConvGradNPUKernel, - ops::DepthwiseConvGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d, - ops::NPUConvOpKernel, - ops::NPUConvOpKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_grad, - ops::NPUConvGradOpKernel, - ops::NPUConvGradOpKernel); - -REGISTER_OP_NPU_KERNEL(conv3d, - ops::NPUConv3dKernel, - ops::NPUConv3dKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_grad, - ops::NPUConv3dGradKernel, - ops::NPUConv3dGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc deleted file mode 100644 index f9da50848df2a..0000000000000 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/cpu/conv_util.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class Conv2DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - for (auto i = output_padding.size(); i < 4; ++i) { - output_padding.insert(output_padding.begin(), 0); - } - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2DTransposeD", - {input_tensor, *filter}, - {output_tensor}, - {{"input_size", output_dim_vec}, - {"strides", strides}, - {"dilations", dilations}, - {"output_padding", output_padding}, - {"groups", groups}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class Conv2DTransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const int groups = ctx.Attr("groups"); - std::string 
padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const phi::DataLayout data_layout = phi::StringToDataLayout(data_format); - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - // auto out_grad_dims = output_grad->dims(); - // const int batch_size = static_cast(input->dims()[0]); - - const bool channel_last = (data_layout == phi::DataLayout::kNHWC); - - framework::DDim in_data_dims; - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Conv2DBackpropFilterD", - {output_grad_tensor, input_tensor}, - {*filter_grad}, - {{"filter_size", phi::vectorize(filter_dims)}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2D", - {output_grad_tensor, *filter}, - {input_grad_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class Conv3DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - if (data_format == "NHWC") { - data_format = "NDHWC"; - } else { - data_format = "NCDHW"; - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - 
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(5, 1); - std::vector dilations(5, 1); - - phi::DenseTensor input_tensor, output_tensor, filter_tensor; - input_tensor.Resize(input->dims()); - input_tensor.ShareDataWith(*input); - output_tensor.Resize(output->dims()); - output_tensor.ShareDataWith(*output); - filter_tensor.Resize(filter->dims()); - filter_tensor.ShareDataWith(*filter); - - PADDLE_ENFORCE_EQ( - dilation[0], - 1, - platform::errors::InvalidArgument( - "dilation[0] must be equal 1, but received %d.", dilation[0])); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNDHWC); - output_tensor.set_layout(DataLayout::kNDHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - strides[3] = stride[2]; - dilations[2] = dilation[1]; - dilations[3] = dilation[2]; - } else { - input_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - strides[2] = stride[0]; - strides[3] = stride[1]; - strides[4] = stride[2]; - dilations[3] = dilation[1]; - dilations[4] = dilation[2]; - } - filter_tensor.set_layout(DataLayout::kNCDHW); - - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto& dev_ctx = ctx.template device_context(); - - NpuOpRunner runner; - runner.SetType("Conv3DBackpropInputD") - .AddInput(filter_tensor) - .AddInput(input_tensor) - .AddAttr("input_size", output_dim_vec) - .AddAttr("strides", strides) - .AddAttr("pads", padding) - .AddAttr("dilations", dilations) - .AddAttr("groups", groups) - .AddAttr("data_format", data_format) - .AddOutput(output_tensor); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(conv2d_transpose, - ops::Conv2DTransposeNPUKernel, - ops::Conv2DTransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad, - ops::Conv2DTransposeGradNPUKernel, - ops::Conv2DTransposeGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_transpose, - ops::Conv3DTransposeNPUKernel, - ops::Conv3DTransposeNPUKernel); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc deleted file mode 100644 index 5aaa832ce3383..0000000000000 --- a/paddle/fluid/operators/crop_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
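The transpose kernels above lean on a standard identity: the forward pass of a transposed convolution is the input-gradient of an ordinary convolution, and vice versa. That is why Conv3DTransposeNPUKernel::Compute runs Conv3DBackpropInputD, and why the input-gradient branch of Conv2DTransposeGradNPUKernel runs a plain Conv2D. The forward call, as it appears in the deleted code:

// Transposed convolution computed as the input-gradient of a forward
// convolution: the input tensor plays the role of the output gradient,
// and "input_size" names the shape of the transposed-conv output.
NpuOpRunner runner;
runner.SetType("Conv3DBackpropInputD")
    .AddInput(filter_tensor)                // the weights
    .AddInput(input_tensor)                 // stands in for dOut
    .AddAttr("input_size", output_dim_vec)  // desired output shape
    .AddAttr("strides", strides)
    .AddAttr("pads", padding)
    .AddAttr("dilations", dilations)
    .AddAttr("groups", groups)
    .AddAttr("data_format", data_format)
    .AddOutput(output_tensor);
runner.Run(dev_ctx.stream());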
*/
-
-#include "paddle/fluid/operators/crop_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CropNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-
-    std::vector<int> offset_list;
-    if (ctx.HasInput("Offsets")) {
-      auto* offsets_tensor = ctx.Input<phi::DenseTensor>("Offsets");
-      paddle::framework::TensorToVector(
-          *offsets_tensor, ctx.device_context(), &offset_list);
-      if (offset_list.empty()) {
-        offset_list.resize(x->dims().size(), 0);
-      }
-    } else {
-      auto res = ctx.Attr<std::vector<int>>("offsets");
-      if (res.empty()) {
-        offset_list.resize(x->dims().size(), 0);
-      } else {
-        offset_list.insert(offset_list.end(), res.begin(), res.end());
-      }
-    }
-
-    PADDLE_ENFORCE_EQ(
-        static_cast<int>(offset_list.size()),
-        x->dims().size(),
-        platform::errors::InvalidArgument(
-            "The shape (%d) of CropOp's "
-            "'offset' attribute should be equal to the shape of dims "
-            "(%d) of the Input(X).",
-            offset_list.size(),
-            x->dims().size()));
-
-    int axis_int = 0;
-    framework::NPUAttributeMap attr_input = {{"offsets", offset_list},
-                                             {"axis", axis_int}};
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    if (ctx.HasInput("Y")) {
-      auto* shape = ctx.Input<phi::DenseTensor>("Y");
-      PADDLE_ENFORCE_EQ(shape->dims().size(),
-                        x->dims().size(),
-                        platform::errors::InvalidArgument(
-                            "The shape of dims of (%d) of CropOp's "
-                            "Input(shape) should be equal to the shape of dims "
-                            "(%d) of the Input(X).",
-                            shape->dims().size(),
-                            x->dims().size()));
-
-      // shape memory maybe have gc.
-      phi::DenseTensor tmp_shape(*shape);
-      tmp_shape.mutable_data<T>(ctx.GetPlace());
-
-      const auto& runner =
-          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    } else {
-      auto shape_size = ctx.Attr<std::vector<int>>("shape");
-      PADDLE_ENFORCE_EQ(shape_size.size(),
-                        x->dims().size(),
-                        platform::errors::InvalidArgument(
-                            "The shape of dims of (%d) of CropOp's "
-                            "Input(shape) should be equal to the shape of dims "
-                            "(%d) of the Input(X).",
-                            shape_size.size(),
-                            x->dims().size()));
-      phi::DenseTensor tmp_shape(x->dtype());
-      tmp_shape.Resize(phi::make_ddim(shape_size));
-      tmp_shape.mutable_data<T>(ctx.GetPlace());
-      const auto& runner =
-          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    crop,
-    ops::CropNPUKernel<float>,
-    ops::CropNPUKernel<int>,
-    ops::CropNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc
deleted file mode 100644
index a5c77922054da..0000000000000
--- a/paddle/fluid/operators/cumsum_op_npu.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -static void CumsumImp(const phi::DenseTensor& input, - phi::DenseTensor* output, - const framework::NPUAttributeMap& attr_input, - const framework::ExecutionContext& ctx) { - auto stream = - ctx.template device_context() - .stream(); - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor tmp_input; - tmp_input.mutable_data(input.dims(), ctx.GetPlace()); - auto dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type())); - const auto& cast_runner_1 = - NpuOpRunner("Cast", - {input}, - {tmp_input}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_1.Run(stream); - - phi::DenseTensor tmp_output; - tmp_output.mutable_data(output->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); - runner.Run(stream); - - dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(output->type())); - const auto& cast_runner_2 = - NpuOpRunner("Cast", - {tmp_output}, - {*output}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_2.Run(stream); - } else { - const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); - runner.Run(stream); - } -} - -template -class CumSumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - bool exclusive = ctx.Attr("exclusive"); - bool reverse = ctx.Attr("reverse"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = { - {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - - bool flatten = ctx.Attr("flatten"); - if (flatten) { - PADDLE_ENFORCE_EQ( - axis, - -1, - platform::errors::InvalidArgument( - "when flatten is true, attr axis must be default %d, but got %d", - -1, - axis)); - - phi::DenseTensor new_x(x->type()); - new_x.ShareDataWith(*x); - - new_x.Resize(phi::make_ddim({x->numel()})); - - CumsumImp(new_x, out, attr_input, ctx); - } else { - CumsumImp(*x, out, attr_input, ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - cumsum, - ops::CumSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::CumSumNPUKernel, -#endif - ops::CumSumNPUKernel, - ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc deleted file mode 100644 index 4170088faff18..0000000000000 --- a/paddle/fluid/operators/detection/box_coder_op_npu.cc +++ /dev/null @@ -1,448 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/impl/box_coder.h" - -namespace paddle { -namespace operators { - -template -struct BoxCoderFunction { - public: - explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); - runner.Run(stream); - return y; - } - phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); - runner.Run(stream); - return y; - } - phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { - phi::DenseTensor z; - z.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); - runner.Run(stream); - return z; - } - phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - z.mutable_data(shape, place); - const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); - runner.Run(stream); - return z; - } - void DivWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - DivWithBroadCastVoid(x, y, shape, &z); - return z; - } - void MulWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - MulWithBroadCastVoid(x, y, shape, &z); - return z; - } - void AddWithBroadCastVoid(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape, - phi::DenseTensor* z) { - z->mutable_data(shape, place); - const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); - runner.Run(stream); - } - phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - phi::DenseTensor z; - AddWithBroadCastVoid(x, y, shape, &z); - return z; - } - phi::DenseTensor Abs(const phi::DenseTensor& x) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Log(const phi::DenseTensor& x) { - phi::DenseTensor t_x_m1 = Adds(x, -1); - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Exp(const phi::DenseTensor& x) { - phi::DenseTensor y; - y.mutable_data(x.dims(), place); - const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); - runner.Run(stream); - return y; - } - phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { - auto dim_x = x.dims(); - auto dim_y = 
y.dims(); - PADDLE_ENFORCE_EQ( - dim_x.size(), - 2, - platform::errors::InvalidArgument( - "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_y.size(), - 2, - platform::errors::InvalidArgument( - "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); - PADDLE_ENFORCE_EQ( - dim_x[1], - dim_y[0], - platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " - "got dim_x[1] = %d, dim_y[0] = %d.", - dim_x[1], - dim_y[0])); - phi::DenseTensor z; - z.mutable_data({dim_x[0], dim_y[1]}, place); - const auto& runner = - NpuOpRunner("MatMul", - {x, y}, - {z}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner.Run(stream); - return z; - } - void ConcatVoid(const std::vector& inputs, - const framework::DDim& shape_out, - int axis, - phi::DenseTensor* output) { - output->mutable_data(shape_out, place); - std::vector names; - for (size_t i = 0; i < inputs.size(); i++) { - names.push_back("x" + std::to_string(i)); - } - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*output}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } - phi::DenseTensor Concat(const std::vector& inputs, - const framework::DDim& shape_out, - int axis) { - phi::DenseTensor output; - ConcatVoid(inputs, shape_out, axis, &output); - return output; - } - phi::DenseTensor Slice(const phi::DenseTensor& x, - const std::vector& offsets, - const std::vector& size, - const framework::DDim& shape) { - phi::DenseTensor y; - y.mutable_data(shape, place); - const auto& runner = - NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -void Vector2Tensor(const framework::ExecutionContext& ctx, - const std::vector& vec, - const framework::DDim& ddim, - phi::DenseTensor* tsr) { - framework::TensorFromVector(vec, ctx.device_context(), tsr); - ctx.template device_context().Wait(); - tsr->Resize(ddim); -} - -template -void BoxCoderEnc(const framework::ExecutionContext& ctx, - const phi::DenseTensor* tb, - const phi::DenseTensor* pb, - const phi::DenseTensor* pbv, - const bool norm, - const std::vector& variance, - phi::DenseTensor* out) { - auto M = pb->dims()[0]; - auto N = tb->dims()[0]; - auto shape_0 = phi::make_ddim({4, 2}); - phi::DenseTensor m_diff; - phi::DenseTensor m_aver; - std::vector vec_diff = {static_cast(-1), - static_cast(0), - static_cast(0), - static_cast(-1), - static_cast(1), - static_cast(0), - static_cast(0), - static_cast(1)}; - std::vector vec_aver = {static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5), - static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5)}; - Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); - Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); - - BoxCoderFunction F(ctx); - phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - phi::DenseTensor tb_xy = F.Dot(*tb, m_aver); - phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); - - pb_xy.Resize({1, M, 2}); - pb_wh.Resize({1, M, 2}); - tb_xy.Resize({N, 1, 2}); - tb_wh.Resize({N, 1, 2}); - - auto shape_half = phi::make_ddim({N, M, 2}); - auto shape_full = phi::make_ddim({N, M, 4}); - - phi::DenseTensor out_xy_0 = F.DivWithBroadCast( - F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); - phi::DenseTensor out_wh_0 = - F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); - phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); - - if (pbv) { - F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); - } else { - phi::DenseTensor t_var; - std::vector vec_var(4); - for (auto i = 0; i < 4; i++) { - vec_var[i] = static_cast(variance[i]); - } - Vector2Tensor(ctx, vec_var, phi::make_ddim({1, 1, 4}), &t_var); - F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); - } -} - -template -void BoxCoderDec(const framework::ExecutionContext& ctx, - const phi::DenseTensor* tb, - const phi::DenseTensor* pb, - const phi::DenseTensor* pbv, - const bool norm, - const std::vector& variance, - int axis, - phi::DenseTensor* out) { - auto shape_0 = phi::make_ddim({4, 2}); - phi::DenseTensor m_diff; - phi::DenseTensor m_aver; - std::vector vec_diff = {static_cast(-1), - static_cast(0), - static_cast(0), - static_cast(-1), - static_cast(1), - static_cast(0), - static_cast(0), - static_cast(1)}; - std::vector vec_aver = {static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5), - static_cast(0.5), - static_cast(0), - static_cast(0), - static_cast(0.5)}; - Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); - Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); - - BoxCoderFunction F(ctx); - phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2}) - : phi::make_ddim({pb->dims()[0], 1, 2}); - pb_xy.Resize(pb_resize_shape); - pb_wh.Resize(pb_resize_shape); - - auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2}); - std::vector tbox_slice_size = { - static_cast(tb->dims()[0]), static_cast(tb->dims()[1]), 2}; - phi::DenseTensor tbox01 = - F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); - phi::DenseTensor tbox23 = - F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); - - phi::DenseTensor tb_xy; - phi::DenseTensor tb_wh; - if (pbv) { - auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2}); - auto pbvt_resize_shape = axis == 0 ? 
phi::make_ddim({1, pbv->dims()[0], 2}) - : phi::make_ddim({pbv->dims()[0], 1, 2}); - std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; - phi::DenseTensor pbv_t01 = - F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); - phi::DenseTensor pbv_t23 = - F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); - pbv_t01.Resize(pbvt_resize_shape); - pbv_t23.Resize(pbvt_resize_shape); - - F.AddWithBroadCastVoid( - F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid( - F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), - pb_wh, - tbox_slice_shape, - &tb_wh); - } else if (variance.empty()) { - F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); - } else { - phi::DenseTensor t_var01, t_var23; - auto t_var_shape = phi::make_ddim({1, 1, 2}); - std::vector vec_var01 = {static_cast(variance[0]), - static_cast(variance[1])}; - std::vector vec_var23 = {static_cast(variance[2]), - static_cast(variance[3])}; - Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); - Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); - F.AddWithBroadCastVoid( - F.MulWithBroadCast(tbox01, - F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), - tbox_slice_shape), - pb_xy, - tbox_slice_shape, - &tb_xy); - F.MulWithBroadCastVoid( - F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), - pb_wh, - tbox_slice_shape, - &tb_wh); - } - phi::DenseTensor obox01 = - F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); - phi::DenseTensor obox23 = - F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), - (norm ? 0 : -1)); - F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); -} - -template -class BoxCoderNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* prior_box = ctx.Input("PriorBox"); - auto* prior_box_var = ctx.Input("PriorBoxVar"); - auto* target_box = ctx.Input("TargetBox"); - auto* output_box = ctx.Output("OutputBox"); - std::vector variance = ctx.Attr>("variance"); - const int axis = ctx.Attr("axis"); - - if (prior_box_var) { - PADDLE_ENFORCE_EQ(variance.empty(), - true, - platform::errors::InvalidArgument( - "Input 'PriorBoxVar' and attribute 'variance'" - " of BoxCoder operator should not be used at the " - "same time.")); - } - if (!(variance.empty())) { - PADDLE_ENFORCE_EQ(static_cast(variance.size()), - 4, - platform::errors::InvalidArgument( - "Size of attribute 'variance' in BoxCoder operator" - " should be 4. 
But received size is %d", - variance.size())); - } - - if (target_box->lod().size()) { - PADDLE_ENFORCE_EQ(target_box->lod().size(), - 1, - platform::errors::InvalidArgument( - "Input 'TargetBox' of BoxCoder operator only" - " supports LoD with one level.")); - } - - auto code_type = - phi::funcs::GetBoxCodeType(ctx.Attr("code_type")); - bool normalized = ctx.Attr("box_normalized"); - - if (code_type == phi::funcs::BoxCodeType::kEncodeCenterSize) { - BoxCoderEnc(ctx, - target_box, - prior_box, - prior_box_var, - normalized, - variance, - output_box); - } else { - BoxCoderDec(ctx, - target_box, - prior_box, - prior_box_var, - normalized, - variance, - axis, - output_box); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(box_coder, - ops::BoxCoderNPUKernel, - ops::BoxCoderNPUKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc deleted file mode 100644 index c9935e54d82ef..0000000000000 --- a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc +++ /dev/null @@ -1,396 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/detection/density_prior_box_op.h" - -namespace paddle { -namespace operators { - -using fp16 = paddle::platform::float16; - -template -struct DensityPriorBoxFunction { - public: - explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context().stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - // x should be init first - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->type())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Concat(const std::vector& inputs, - int axis, - phi::DenseTensor* output) { - // output should be init first - std::vector names; - for (size_t i = 0; i < inputs.size(); i++) { - names.push_back("x" + std::to_string(i)); - } - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*output}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } - void Tile(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& multiples) { - // y should be init first - if (x->dims() == y->dims()) { - framework::TensorCopy( - *x, - place, - ctx.template device_context(), - y); - return; - } - const auto& runner = - NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}}); - runner.Run(stream); - } - void FloatVec2Tsr(const std::vector& vec, phi::DenseTensor* tsr_dst) { - // - framework::TensorFromVector(vec, ctx.device_context(), tsr_dst); - ctx.template device_context().Wait(); - } - - private: - platform::Place place; - 
aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -template <> -void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, - phi::DenseTensor* tsr_dst) { - phi::DenseTensor tsr_fp32(phi::DataType::FLOAT32); - tsr_fp32.mutable_data(tsr_dst->dims(), place); - framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); - ctx.template device_context().Wait(); - Cast(&tsr_fp32, tsr_dst); -} - -template -class DensityPriorBoxOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* vars = ctx.Output("Variances"); - - auto variances = ctx.Attr>("variances"); - auto clip = ctx.Attr("clip"); - - auto fixed_sizes = ctx.Attr>("fixed_sizes"); - auto fixed_ratios = ctx.Attr>("fixed_ratios"); - auto densities = ctx.Attr>("densities"); - - float step_w = ctx.Attr("step_w"); - float step_h = ctx.Attr("step_h"); - float offset = ctx.Attr("offset"); - - int image_w = image->dims()[3]; - int image_h = image->dims()[2]; - int layer_w = input->dims()[3]; - int layer_h = input->dims()[2]; - - auto _type = input->dtype(); - auto place = ctx.GetPlace(); - DensityPriorBoxFunction F(ctx); - - phi::DenseTensor h(_type); - h.mutable_data({layer_h}, place); - phi::DenseTensor w(_type); - w.mutable_data({layer_w}, place); - F.Arange(layer_h, &h); - F.Arange(layer_w, &w); - h.Resize({layer_h, 1, 1, 1}); - w.Resize({1, layer_w, 1, 1}); - - step_w = step_w > 0 ? step_w : static_cast(image_w) / layer_w; - step_h = step_h > 0 ? 
step_h : static_cast(image_h) / layer_h; - int step_average = static_cast((step_w + step_h) * 0.5); - - int ratios_size = fixed_ratios.size(); - int num_priors_per_ratio = 0; - for (size_t i = 0; i < densities.size(); ++i) { - num_priors_per_ratio += densities[i] * densities[i]; - } - phi::DenseTensor di(_type); - phi::DenseTensor dj(_type); - phi::DenseTensor shifts(_type); - phi::DenseTensor box_w_ratio(_type); - phi::DenseTensor box_h_ratio(_type); - di.mutable_data({ratios_size * num_priors_per_ratio}, place); - dj.mutable_data({ratios_size * num_priors_per_ratio}, place); - shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); - box_w_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); - box_h_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); - - int64_t start = 0; - std::vector vec_tile = {0, 0, 0}; - for (size_t i = 0; i < densities.size(); ++i) { - // Range = start:start+ratios_size*density_sqr, density = densities[i] - int density_sqr = densities[i] * densities[i]; - // shifts[Range] = [step_average/density]*ratios_size*density_sqr - phi::DenseTensor shifts_part = - shifts.Slice(start, start + ratios_size * density_sqr); - FillNpuTensorWithConstant(&shifts_part, - static_cast(step_average / densities[i])); - - // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size - // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size - phi::DenseTensor di_part = - di.Slice(start, start + ratios_size * density_sqr); - phi::DenseTensor dj_part = - dj.Slice(start, start + ratios_size * density_sqr); - if (densities[i] > 1) { - di_part.Resize({ratios_size, densities[i], densities[i]}); - dj_part.Resize({ratios_size, densities[i], densities[i]}); - phi::DenseTensor range_n(_type); - range_n.mutable_data({densities[i]}, place); - F.Arange(densities[i], &range_n); - range_n.Resize({1, densities[i], 1}); - vec_tile[0] = ratios_size; - vec_tile[1] = 1; - vec_tile[2] = densities[i]; - F.Tile(&range_n, &di_part, vec_tile); - range_n.Resize({1, 1, densities[i]}); - vec_tile[1] = densities[i]; - vec_tile[2] = 1; - F.Tile(&range_n, &dj_part, vec_tile); - } else { - FillNpuTensorWithConstant(&di_part, static_cast(0)); - FillNpuTensorWithConstant(&dj_part, static_cast(0)); - } - - int start_box_ratio = start; - for (float ar : fixed_ratios) { - // Range_mini = start_box_ratio:start_box_ratio+density_sqr - // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr - // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr - phi::DenseTensor box_h_ratio_part = - box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - phi::DenseTensor box_w_ratio_part = - box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - FillNpuTensorWithConstant(&box_w_ratio_part, - static_cast(fixed_sizes[i] * sqrt(ar))); - FillNpuTensorWithConstant(&box_h_ratio_part, - static_cast(fixed_sizes[i] / sqrt(ar))); - start_box_ratio += density_sqr; - } - start = start_box_ratio; - } - di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); - - // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts - // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts - phi::DenseTensor c_x(_type); - phi::DenseTensor c_y(_type); - auto dim0 
= - phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1}); - auto dim1 = - phi::make_ddim({layer_h, 1, ratios_size * num_priors_per_ratio, 1}); - c_x.mutable_data(dim0, place); - c_y.mutable_data(dim1, place); - F.Adds(&w, offset, &w); - F.Muls(&w, step_w, &w); - F.Adds(&w, static_cast(-step_average) * static_cast(0.5), &w); - F.Adds(&h, offset, &h); - F.Muls(&h, step_h, &h); - F.Adds(&h, static_cast(-step_average) * static_cast(0.5), &h); - F.Mul(&di, &shifts, &di); - F.Mul(&dj, &shifts, &dj); - F.Muls(&shifts, static_cast(0.5), &shifts); - F.Add(&di, &shifts, &di); - F.Add(&dj, &shifts, &dj); - F.Add(&dj, &w, &c_x); - F.Add(&di, &h, &c_y); - - // box_w_ratio = box_w_ratio / 2 - // box_h_ratio = box_h_ratio / 2 - F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); - F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); - - phi::DenseTensor zero_t(_type); - phi::DenseTensor one_t(_type); - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - - phi::DenseTensor outbox0(_type); - phi::DenseTensor outbox1(_type); - phi::DenseTensor outbox2(_type); - phi::DenseTensor outbox3(_type); - outbox0.mutable_data(dim0, place); - outbox1.mutable_data(dim1, place); - outbox2.mutable_data(dim0, place); - outbox3.mutable_data(dim1, place); - - // outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 ) - // outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 ) - // outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 ) - // outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 ) - F.Sub(&c_x, &box_w_ratio, &outbox0); - F.Sub(&c_y, &box_h_ratio, &outbox1); - F.Add(&c_x, &box_w_ratio, &outbox2); - F.Add(&c_y, &box_h_ratio, &outbox3); - F.Muls(&outbox0, static_cast(1.0 / image_w), &outbox0); - F.Muls(&outbox1, static_cast(1.0 / image_h), &outbox1); - F.Muls(&outbox2, static_cast(1.0 / image_w), &outbox2); - F.Muls(&outbox3, static_cast(1.0 / image_h), &outbox3); - - F.Maximum(&outbox0, &zero_t, &outbox0); - F.Maximum(&outbox1, &zero_t, &outbox1); - F.Minimum(&outbox2, &one_t, &outbox2); - F.Minimum(&outbox3, &one_t, &outbox3); - if (clip) { - // outbox0 = min ( outbox0, 1 ) - // outbox1 = min ( outbox1, 1 ) - // outbox2 = max ( outbox2, 0 ) - // outbox3 = max ( outbox3, 0 ) - F.Minimum(&outbox0, &one_t, &outbox0); - F.Minimum(&outbox1, &one_t, &outbox1); - F.Maximum(&outbox2, &zero_t, &outbox2); - F.Maximum(&outbox3, &zero_t, &outbox3); - } - - auto out_dim = phi::make_ddim( - {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); - boxes->mutable_data(place); - vars->mutable_data(place); - phi::DenseTensor boxes_share(_type); - phi::DenseTensor vars_share(_type); - boxes_share.ShareDataWith(*boxes); - boxes_share.Resize(out_dim); - vars_share.ShareDataWith(*vars); - vars_share.Resize(out_dim); - - phi::DenseTensor box0(_type); - phi::DenseTensor box1(_type); - phi::DenseTensor box2(_type); - phi::DenseTensor box3(_type); - // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} - out_dim[3] = 1; - box0.mutable_data(out_dim, place); - box1.mutable_data(out_dim, place); - box2.mutable_data(out_dim, place); - box3.mutable_data(out_dim, place); - - std::vector vec_exp_out02 = {layer_h, 1, 1, 1}; - std::vector vec_exp_out13 = {1, layer_w, 1, 1}; - F.Tile(&outbox0, &box0, vec_exp_out02); - F.Tile(&outbox1, &box1, vec_exp_out13); - F.Tile(&outbox2, &box2, vec_exp_out02); - F.Tile(&outbox3, &box3, vec_exp_out13); - F.Concat({box0, box1, box2, box3}, 3, &boxes_share); - - std::vector 
multiples = { - layer_h, layer_w, ratios_size * num_priors_per_ratio, 1}; - phi::DenseTensor variances_t(_type); - // variances.size() == 4 - variances_t.mutable_data({4}, place); - F.FloatVec2Tsr(variances, &variances_t); - F.Tile(&variances_t, &vars_share, multiples); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(density_prior_box, - ops::DensityPriorBoxOpNPUKernel, - ops::DensityPriorBoxOpNPUKernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc deleted file mode 100644 index 8395e25d46251..0000000000000 --- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc +++ /dev/null @@ -1,204 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/iou_similarity_op.h" - -namespace paddle { -namespace operators { - -template -struct IouFunction { - public: - explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // z should be init first - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - 
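// A scalar reference for what the kernel below computes per box pair
// (illustrative sketch assuming the usual corner-encoded boxes; not part
// of the original file):
//
//   float w = std::max(0.f, std::min(xmax1, xmax2) - std::max(xmin1, xmin2));
//   float h = std::max(0.f, std::min(ymax1, ymax2) - std::max(ymin1, ymin2));
//   float inter = w * h;                              // intersection area
//   float area1 = (xmax1 - xmin1) * (ymax1 - ymin1);
//   float area2 = (xmax2 - xmin2) * (ymax2 - ymin2);
//   float iou = inter / (area1 + area2 - inter);      // DivNoNan on the NPU
//
// The NPU version vectorizes this by transposing X and Y to {4, N} and
// {4, M}, reshaping each coordinate row to {N, 1} or {1, M}, and letting
// every elementwise CANN op broadcast to {N, M}; when box_normalized is
// false it first adds 1 to the widths and heights to measure inclusive
// pixel extents.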
-template -class IouSimilarityNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - bool normalized = ctx.Attr("box_normalized"); - auto* out = ctx.Output("Out"); - - auto _type = x->dtype(); - auto place = ctx.GetPlace(); - - IouFunction F(ctx); - - auto N = x->dims()[0]; - auto M = y->dims()[0]; - - out->mutable_data({N, M}, place); - phi::DenseTensor xt(_type); - phi::DenseTensor yt(_type); - xt.mutable_data({4, N}, place); - yt.mutable_data({4, M}, place); - std::vector vec_trans = {1, 0}; - F.Transpose(x, &xt, vec_trans); - F.Transpose(y, &yt, vec_trans); - phi::DenseTensor xmin1 = xt.Slice(0, 1); - phi::DenseTensor ymin1 = xt.Slice(1, 2); - phi::DenseTensor xmax1 = xt.Slice(2, 3); - phi::DenseTensor ymax1 = xt.Slice(3, 4); - phi::DenseTensor xmin2 = yt.Slice(0, 1); - phi::DenseTensor ymin2 = yt.Slice(1, 2); - phi::DenseTensor xmax2 = yt.Slice(2, 3); - phi::DenseTensor ymax2 = yt.Slice(3, 4); - xmin1.Resize({N, 1}); - ymin1.Resize({N, 1}); - xmax1.Resize({N, 1}); - ymax1.Resize({N, 1}); - xmin2.Resize({1, M}); - ymin2.Resize({1, M}); - xmax2.Resize({1, M}); - ymax2.Resize({1, M}); - - phi::DenseTensor w1(_type); - phi::DenseTensor h1(_type); - phi::DenseTensor w2(_type); - phi::DenseTensor h2(_type); - phi::DenseTensor area1(_type); - phi::DenseTensor area2(_type); - w1.mutable_data({N, 1}, place); - h1.mutable_data({N, 1}, place); - w2.mutable_data({1, M}, place); - h2.mutable_data({1, M}, place); - area1.mutable_data({N, 1}, place); - area2.mutable_data({1, M}, place); - F.Sub(&xmax1, &xmin1, &w1); - F.Sub(&ymax1, &ymin1, &h1); - F.Sub(&xmax2, &xmin2, &w2); - F.Sub(&ymax2, &ymin2, &h2); - if (!normalized) { - F.Adds(&w1, 1.0f, &w1); - F.Adds(&h1, 1.0f, &h1); - F.Adds(&w2, 1.0f, &w2); - F.Adds(&h2, 1.0f, &h2); - } - F.Mul(&w1, &h1, &area1); - F.Mul(&w2, &h2, &area2); - - phi::DenseTensor inter_xmax(_type); - phi::DenseTensor inter_ymax(_type); - phi::DenseTensor inter_xmin(_type); - phi::DenseTensor inter_ymin(_type); - inter_xmax.mutable_data({N, M}, place); - inter_ymax.mutable_data({N, M}, place); - inter_xmin.mutable_data({N, M}, place); - inter_ymin.mutable_data({N, M}, place); - F.Minimum(&xmax1, &xmax2, &inter_xmax); - F.Minimum(&ymax1, &ymax2, &inter_ymax); - F.Maximum(&xmin1, &xmin2, &inter_xmin); - F.Maximum(&ymin1, &ymin2, &inter_ymin); - - phi::DenseTensor inter_w(_type); - phi::DenseTensor inter_h(_type); - inter_w.mutable_data({N, M}, place); - inter_h.mutable_data({N, M}, place); - F.Sub(&inter_xmax, &inter_xmin, &inter_w); - F.Sub(&inter_ymax, &inter_ymin, &inter_h); - - if (!normalized) { - F.Adds(&inter_w, 1.0f, &inter_w); - F.Adds(&inter_h, 1.0f, &inter_h); - } - phi::DenseTensor zeros(_type); - zeros.mutable_data({1}, place); - FillNpuTensorWithConstant(&zeros, static_cast(0)); - F.Maximum(&inter_w, &zeros, &inter_w); - F.Maximum(&inter_h, &zeros, &inter_h); - - F.Mul(&inter_w, &inter_h, out); - phi::DenseTensor union_area(_type); - union_area.mutable_data({N, M}, place); - F.Add(&area1, &area2, &union_area); - F.Sub(&union_area, out, &union_area); - F.DivNoNan(out, &union_area, out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(iou_similarity, - ops::IouSimilarityNPUKernel, - ops::IouSimilarityNPUKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op_npu.cc b/paddle/fluid/operators/detection/prior_box_op_npu.cc 
deleted file mode 100644 index 7df68d2bbb1bb..0000000000000 --- a/paddle/fluid/operators/detection/prior_box_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/detection/prior_box_op.h" - -namespace paddle { -namespace operators { - -template -class PriorBoxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* image = ctx.Input("Image"); - auto* boxes = ctx.Output("Boxes"); - auto* variances = ctx.Output("Variances"); - - PADDLE_ENFORCE_EQ(boxes->dims(), - variances->dims(), - platform::errors::Unimplemented( - "the shape of boxes and variances must be same in " - "the npu kernel of prior_box, but got boxes->dims() " - "= [%s], variances->dims() = [%s]", - boxes->dims(), - variances->dims())); - - auto min_sizes = ctx.Attr>("min_sizes"); - auto max_sizes = ctx.Attr>("max_sizes"); - auto aspect_ratios = ctx.Attr>("aspect_ratios"); - auto variances_attr = ctx.Attr>("variances"); - bool flip = ctx.Attr("flip"); - bool clip = ctx.Attr("clip"); - float step_w = ctx.Attr("step_w"); - float step_h = ctx.Attr("step_h"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor out(input->type()); - auto out_dims = phi::vectorize(boxes->dims()); - out_dims.insert(out_dims.begin(), 2); - out.Resize(phi::make_ddim(out_dims)); - out.mutable_data(place); - - framework::NPUAttributeMap attr_input = {{"min_size", min_sizes}, - {"max_size", max_sizes}, - {"aspect_ratio", aspect_ratios}, - {"step_h", step_h}, - {"step_w", step_w}, - {"flip", flip}, - {"clip", clip}, - {"offset", offset}, - {"variance", variances_attr}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("PriorBox", {*input, *image}, {out}, attr_input); - runner.Run(stream); - - out.Resize(phi::make_ddim({out.numel()})); - phi::DenseTensor out_boxes = out.Slice(0, boxes->numel()); - phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel()); - - out_boxes.Resize(boxes->dims()); - out_variances.Resize(variances->dims()); - - boxes->mutable_data(place); - variances->mutable_data(place); - - framework::TensorCopy( - out_boxes, - place, - ctx.template device_context(), - boxes); - framework::TensorCopy( - out_variances, - place, - ctx.template device_context(), - variances); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - prior_box, - ops::PriorBoxNPUKernel, - ops::PriorBoxNPUKernel); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc deleted file mode 100644 index 9c84961f611c0..0000000000000 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ /dev/null @@ -1,212 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class DropoutNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* seed_tensor = - ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; - auto* out = ctx.Output("Out"); - auto* mask = ctx.Output("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out}); - runner_zeros_out.Run(stream); - mask->mutable_data(ctx.GetPlace()); - const auto& runner_zeros_mask = - NpuOpRunner("ZerosLike", {*mask}, {*mask}); - runner_zeros_mask.Run(stream); - return; - } - - // only achieve the default `upscale_in_train` method - if (!is_test) { - phi::DenseTensor tmp_x(x->dtype()); - phi::DenseTensor tmp_out(out->dtype()); - tmp_x.ShareDataWith(*x); - tmp_out.ShareDataWith(*out); - if (x->dims().size() == 1) { - // DropOutDoMask will get error result when input - // is 1-D. Make it become 2-D. - std::vector vec_dim = phi::vectorize(x->dims()); - tmp_x.Resize(phi::make_ddim({vec_dim[0], 1})); - tmp_out.Resize(phi::make_ddim({vec_dim[0], 1})); - } - - int seed = 0; - int seed2 = 0; - float keep_prob = 1. - dropout_prob; - if (seed_tensor) { - std::vector seed_data; - paddle::framework::TensorToVector( - *seed_tensor, ctx.device_context(), &seed_data); - seed = seed_data[0]; - } else { - seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; - } - - phi::DenseTensor keep_prob_tensor(x->dtype()); - keep_prob_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&keep_prob_tensor, - static_cast(keep_prob)); - - mask->mutable_data(ctx.GetPlace()); - - // mask used in `DropOutGenMask` NPU OP is different from - // the output `Mask`. - phi::DenseTensor npu_mask(phi::DataType::UINT8); - uint32_t length = (x->numel() + 128 - 1) / 128 * 128; - npu_mask.Resize(phi::make_ddim({length / 8})); - npu_mask.mutable_data(ctx.GetPlace()); - - // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU - // OP must be a scalar with shape[0]. At present, the shape - // of the `prob` phi::DenseTensor of this OP is forced to be set to 0 - // in `npu_op_runner.cc`, which needs to be optimized later. 
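// Worked example of the mask sizing above (assumed numbers, not from the
// original file): for x->numel() == 1000,
// length = (1000 + 127) / 128 * 128 = 1024 mask bits, so npu_mask holds
// length / 8 = 128 uint8 bytes. DropOutGenMask emits one keep/drop bit
// per element, padded up to a 128-bit boundary.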
- NpuOpRunner runner_gen_mask; - runner_gen_mask.SetType("DropOutGenMask") - .AddInput(phi::vectorize(tmp_out.dims())) - .AddInput(keep_prob_tensor) - .AddOutput(npu_mask) - .AddAttr("seed", seed) - .AddAttr("seed2", seed2); - runner_gen_mask.Run(stream); - - NpuOpRunner runner_dropout; - runner_dropout.SetType("DropOutDoMask") - .AddInput(tmp_x) - .AddInput(npu_mask) - .AddInput(keep_prob_tensor) - .AddOutput(tmp_out); - runner_dropout.Run(stream); - - // cast `out` from float/float16 to bool - phi::DenseTensor cast_mask(phi::DataType::BOOL); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype_bool = - ConvertToNpuDtype(framework::TransToProtoVarType(cast_mask.dtype())); - const auto& runner_cast_mask_bool = - NpuOpRunner("Cast", - {*out}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype_bool)}}); - runner_cast_mask_bool.Run(stream); - - // cast cast_mask from bool to uint8 - auto dst_dtype_uint8 = - ConvertToNpuDtype(framework::TransToProtoVarType(mask->dtype())); - const auto& runner_cast_mask_uint8 = - NpuOpRunner("Cast", - {cast_mask}, - {*mask}, - {{"dst_type", static_cast(dst_dtype_uint8)}}); - runner_cast_mask_uint8.Run(stream); - } else { - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - } - } -}; - -template -class DropoutGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* mask = ctx.Input("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ(is_test, - false, - platform::errors::PreconditionNotMet( - "GradOp is only callable when is_test is false")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx}); - runner_zeros.Run(stream); - return; - } - - // cast mask from uint8 to float32/float16 - phi::DenseTensor cast_mask(dx->dtype()); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dx->dtype())); - const auto& runner_cast_mask = - NpuOpRunner("Cast", - {*mask}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mask.Run(stream); - - const auto& runner = - NpuOpRunner("MaskedScale", - {*dout, cast_mask}, - {*dx}, - {{"value", static_cast(1. 
/ (1 - dropout_prob))}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - dropout, - ops::DropoutNPUKernel, - ops::DropoutNPUKernel); - -REGISTER_OP_NPU_KERNEL( - dropout_grad, - ops::DropoutGradNPUKernel, - ops::DropoutGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt index 25b34a2c0a2c3..e0714041dfabe 100644 --- a/paddle/fluid/operators/elementwise/CMakeLists.txt +++ b/paddle/fluid/operators/elementwise/CMakeLists.txt @@ -17,16 +17,3 @@ cc_test( test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) - -if(WITH_ASCEND_CL) - cc_test( - elementwise_op_npu_test - SRCS elementwise_op_npu_test.cc - DEPS op_registry - elementwise_add_op - elementwise_sub_op - scope - device_context - enforce - executor) -endif() diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc deleted file mode 100644 index 2ae45d5973d2a..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseAddNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - - if (x_dims.size() == y_dims.size()) { - direct_compute = true; - } else if (x_dims.size() > y_dims.size()) { - direct_compute = x_dims.size() == (y_dims.size() + axis); - } else { - direct_compute = y_dims.size() == (x_dims.size() + axis); - } - - if (direct_compute) { - const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - const auto& runner = - NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {}); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class ElementwiseAddGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); - auto stream = dev_ctx.stream(); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - if (dx->dims() != dout->dims()) { - std::vector dst_dims_vec; - std::vector reduce_axes; - auto src_dims = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (!reduce_axes.empty()) { - phi::DenseTensor tmp; - tmp.ShareDataWith(*dx); - tmp.Resize(phi::make_ddim(dst_dims_vec)); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {tmp}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); - } - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if (dy->dims() != dout->dims()) { - std::vector dst_dims_vec; - std::vector reduce_axes; - auto src_dims = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (!reduce_axes.empty()) { - phi::DenseTensor tmp; - tmp.ShareDataWith(*dy); - tmp.Resize(phi::make_ddim(dst_dims_vec)); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {tmp}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(elementwise_add, - ops::ElementwiseAddNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseAddNPUKernel, -#endif - ops::ElementwiseAddNPUKernel); - -REGISTER_OP_NPU_KERNEL(elementwise_add_grad, - ops::ElementwiseAddGradNPUKernel, - ops::ElementwiseAddGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc deleted file mode 100644 index 259a517a2d32d..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseDivNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ElementwiseDivGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - if (dx) { - dx->mutable_data(place); - - phi::DenseTensor tensor_one(y->type()); - tensor_one.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_one, static_cast(1.0)); - - // Use `Div` CANN OP to achieve `1/y` instead of `Power` CANN OP. - // Because `Power` will cause precision overflow, that is, `float_status` - // will be set to 1. 
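// In scalar form, the masked gradient assembled below is (illustrative
// sketch, not part of the original file):
//
//   dx[i] = dout[i] * (x[i] != 0 ? 1 / y[i] : 0)
//
// 1/y comes from Div applied to the 1-filled scalar and y, the x != 0
// predicate from ZerosLike + Equal + LogicalNot + Cast(float), and Mul
// combines the factors, avoiding the Power CANN op mentioned above.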
- phi::DenseTensor y_div(y->type()); - y_div.mutable_data(y->dims(), place); - const auto& runner_one_div_y = - NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {}); - runner_one_div_y.Run(stream); - - phi::DenseTensor tensor_zeros(x->type()); - tensor_zeros.mutable_data(x->dims(), place); - const auto& runner_tensor_zeros = - NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); - runner_tensor_zeros.Run(stream); - - phi::DenseTensor x_zero(phi::DataType::BOOL); - x_zero.mutable_data(x->dims(), place); - const auto& runner_x_zero = - NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); - runner_x_zero.Run(stream); - - phi::DenseTensor x_nozero(phi::DataType::BOOL); - x_nozero.mutable_data(x->dims(), place); - const auto& runner_x_nonzero = - NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); - runner_x_nonzero.Run(stream); - - phi::DenseTensor x_nozero_f(x->type()); - x_nozero_f.mutable_data(x->dims(), place); - const auto& runner_x_nonzero_f = - NpuOpRunner("Cast", - {x_nozero}, - {x_nozero_f}, - {{"dst_type", static_cast(0)}}); - runner_x_nonzero_f.Run(stream); - - phi::DenseTensor x_grad_w(x->type()); - x_grad_w.mutable_data(x->dims(), place); - const auto& runner_x_grad_w = - NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {}); - runner_x_grad_w.Run(stream); - - const auto& runner_x_grad = - NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); - runner_x_grad.Run(stream); - } - - if (dy) { - dy->mutable_data(place); - - phi::DenseTensor neg_out(out->type()); - neg_out.mutable_data(out->dims(), place); - const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {}); - runner_neg_out.Run(stream); - - phi::DenseTensor tmp_mul(out->type()); - tmp_mul.mutable_data(out->dims(), place); - const auto& runner_mul = - NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, {}); - runner_mul.Run(stream); - - if (dy->dims() != dout->dims()) { - phi::DenseTensor reduced_tmp_mul(y->type()); - reduced_tmp_mul.mutable_data(y->dims(), place); - - std::vector axes; - int64_t diff = dout->dims().size() - dy->dims().size(); - for (int64_t i = 0; i < dout->dims().size(); ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (dout->dims()[i] > dy->dims()[i - diff]) { - axes.push_back(i); - } - } - const auto& runner_reduce = - NpuOpRunner("ReduceSumD", - {tmp_mul}, - {reduced_tmp_mul}, - {{"axes", axes}, {"keep_dims", false}}); - runner_reduce.Run(stream); - - const auto& runner_y_grad = - NpuOpRunner("Div", {reduced_tmp_mul, *y}, {*dy}, {}); - runner_y_grad.Run(stream); - } else { - const auto& runner_y_grad = - NpuOpRunner("Div", {tmp_mul, *y}, {*dy}, {}); - runner_y_grad.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - elementwise_div, - ops::ElementwiseDivNPUKernel, - ops::ElementwiseDivNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradNPUKernel, - ops::ElementwiseDivGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc deleted file mode 100644 index 791c352157781..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
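// A scalar sketch, with double stand-ins for tensors, of how the deleted
// div-grad kernel above assembles dx: compute 1/y with a plain division
// (Div) rather than Power, to avoid the float_status overflow noted in the
// comment; zero the result where x == 0 (Equal + LogicalNot + Cast); then
// scale by dout. The function name is invented for this sketch.
#include <cstdio>
#include <vector>

std::vector<double> DivGradDx(const std::vector<double>& x,
                              const std::vector<double>& y,
                              const std::vector<double>& dout) {
  std::vector<double> dx(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const double y_div = 1.0 / y[i];                     // Div(one, y)
    const double x_nonzero = (x[i] == 0.0) ? 0.0 : 1.0;  // zero mask on x
    dx[i] = x_nonzero * y_div * dout[i];                 // Mul, Mul
  }
  return dx;
}

int main() {
  const auto dx = DivGradDx({2.0, 0.0}, {4.0, 4.0}, {1.0, 1.0});
  std::printf("%g %g\n", dx[0], dx[1]);  // 0.25 0
  return 0;
}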
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseFloorDivNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(elementwise_floordiv, - ops::ElementwiseFloorDivNPUKernel, - ops::ElementwiseFloorDivNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc deleted file mode 100644 index 1f3c6229c1854..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ /dev/null @@ -1,251 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - - auto stream = - ctx.template device_context() - .stream(); - - if (direct_compute) { - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); - runner.Run(stream); - } else { - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - const auto& runner = - NpuOpRunner("Maximum", {transformed_x, transformed_y}, {*out}, {}); - runner.Run(stream); - } - } -}; - -template -class ElementwiseMaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - // The ascend elementwise_max_grad op only supports broadcast - // when axis is -1, and requires that all the inputs have the - // same shape when axis is not -1. For convenience, we first - // broadcast the original inputs x and y to transformed_x and - // transformed_y, then use tmp tensors to hold the op - // output, and finally reduce the tmp tensor shape to match the - // paddle output. - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - - auto dout_dims = dout->dims(); - auto stream = dev_ctx.stream(); - framework::NPUAttributeMap attr_input = {{"grad_x", true}, - {"grad_y", true}}; - // Reshape info vector. - std::vector reduce_axes; - - if (dx && dy) { - dx->mutable_data(ctx.GetPlace()); - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_dx; - tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); - phi::DenseTensor tmp_dy; - tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner("MaximumGrad", - {*dout, transformed_x, transformed_y}, - {tmp_dx, tmp_dy}, - attr_input); - runner.Run(stream); - - if (x_dims != dout_dims) { - reduce_axes.clear(); - int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + x_dims.size()) || - (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dx}, - {*dx}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx); - } - - if (y_dims != dout_dims) { - reduce_axes.clear(); - int src_axis = (y_dims.size() < dout_dims.size() ?
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + y_dims.size()) || - (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dy}, - {*dy}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy); - } - - } else if (dx) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dx->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_dx; - tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner("MaximumGrad", - {*dout, transformed_x, transformed_y}, - {tmp_dx, zero_tensor}, - attr_input); - runner.Run(stream); - - if (x_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + x_dims.size()) || - (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dx}, - {*dx}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx); - } - - } else if (dy) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_dy; - tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner("MaximumGrad", - {*dout, transformed_x, transformed_y}, - {zero_tensor, tmp_dy}, - attr_input); - runner.Run(stream); - - if (y_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + y_dims.size()) || - (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dy}, - {*dy}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy); - } - } else { - PADDLE_THROW(platform::errors::Unavailable( - "Do not support all outputs to be empty.")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - elementwise_max, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel, - ops::ElementwiseMaxNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_max_grad, - ops::ElementwiseMaxGradNPUKernel, - ops::ElementwiseMaxGradNPUKernel, - ops::ElementwiseMaxGradNPUKernel, - ops::ElementwiseMaxGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc deleted file mode 100644 index 18d31430eb242..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseMinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - int axis = ctx.Attr("axis"); - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - phi::DenseTensor transformed_x, transformed_y; - if (direct_compute) { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } else { - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - } - const auto& runner = - NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ElementwiseMinGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); - auto stream = dev_ctx.stream(); - if (dx && dy) { - // dx - dx->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_x; - tmp_x.ShareDataWith(*dx); - if (dx->dims() != dout->dims()) { - std::vector dst_dims_vec_x; - std::vector reduce_axes_x; - auto src_dims_x = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) || - (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) { - reduce_axes_x.push_back(ax); - } else { - dst_dims_vec_x.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_x.empty()) { - tmp_x.Resize(phi::make_ddim(dst_dims_vec_x)); - } - } - // dy - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_y; - tmp_y.ShareDataWith(*dy); - if (dy->dims() != dout->dims()) { - std::vector dst_dims_vec_y; - std::vector reduce_axes_y; - auto src_dims_y = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis_y = (src_dims_y.size() < dout_dims.size() ? 
axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) || - (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) { - reduce_axes_y.push_back(ax); - } else { - dst_dims_vec_y.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_y.empty()) { - tmp_y.Resize(phi::make_ddim(dst_dims_vec_y)); - } - } - - const auto& runner = NpuOpRunner("MinimumGrad", - {*dout, *x, *y}, - {tmp_x, tmp_y}, - {{"grad_x", true}, {"grad_y", true}}); - runner.Run(stream); - - } else if (dx) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(y->dims(), ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - // dx - dx->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_x; - tmp_x.ShareDataWith(*dx); - if (dx->dims() != dout->dims()) { - std::vector dst_dims_vec_x; - std::vector reduce_axes_x; - auto src_dims_x = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) || - (dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) { - reduce_axes_x.push_back(ax); - } else { - dst_dims_vec_x.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_x.empty()) { - tmp_x.Resize(phi::make_ddim(dst_dims_vec_x)); - } - } - - const auto& runner = NpuOpRunner("MinimumGrad", - {*dout, *x, *y}, - {tmp_x, zero_tensor}, - {{"grad_x", true}, {"grad_y", true}}); - runner.Run(stream); - - } else if (dy) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(x->dims(), ctx.GetPlace()); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - // dy - dy->mutable_data(ctx.GetPlace()); - phi::DenseTensor tmp_y; - tmp_y.ShareDataWith(*dy); - if (dy->dims() != dout->dims()) { - std::vector dst_dims_vec_y; - std::vector reduce_axes_y; - auto src_dims_y = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) || - (dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) { - reduce_axes_y.push_back(ax); - } else { - dst_dims_vec_y.push_back(dout_dims[ax]); - } - } - if (!reduce_axes_y.empty()) { - tmp_y.Resize(phi::make_ddim(dst_dims_vec_y)); - } - } - - const auto& runner = NpuOpRunner("MinimumGrad", - {*dout, *x, *y}, - {zero_tensor, tmp_y}, - {{"grad_x", true}, {"grad_y", true}}); - runner.Run(stream); - - } else { - PADDLE_THROW(platform::errors::Unavailable( - "At least one of dx and dy must be non-null.")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - elementwise_min, - ops::ElementwiseMinNPUKernel, - ops::ElementwiseMinNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_min_grad, - ops::ElementwiseMinGradNPUKernel, - ops::ElementwiseMinGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc deleted file mode 100644 index da7895b2481fe..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
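// A worked standalone example of the axis normalization used by all of these
// kernels: axis == -1 means "align the smaller operand with the trailing
// dimensions of the larger one", so the effective axis is the rank
// difference. Plain ints; the function name is invented for this sketch.
#include <cstdio>
#include <cstdlib>

int NormalizeAxis(int axis, int x_rank, int y_rank) {
  return axis == -1 ? std::abs(x_rank - y_rank) : axis;
}

int main() {
  // x: [2, 3, 4, 5], y: [4, 5] -> axis -1 normalizes to 2, aligning y with
  // dimensions 2..3 of x.
  std::printf("%d\n", NormalizeAxis(-1, 4, 2));  // prints 2
  std::printf("%d\n", NormalizeAxis(1, 4, 3));   // explicit axis is kept: 1
  return 0;
}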
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseModNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - - bool direct_compute = false; - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - - phi::DenseTensor transformed_x, transformed_y; - if (direct_compute) { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } else { - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("FloorMod", {transformed_x, transformed_y}, {*out}, {}); - auto stream = dev_ctx.stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - elementwise_mod, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel, - ops::ElementwiseModNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc deleted file mode 100644 index 9af1293d672fb..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
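// A plain-C++ sketch of the direct_compute test used by the mod kernel below
// and its siblings: broadcasting can be skipped when the smaller shape equals
// the larger shape's trailing slice starting at axis, which is what the
// phi::slice_ddim comparison checks. Names are invented for this sketch.
#include <cstdio>
#include <vector>

bool DirectCompute(const std::vector<int>& x_dims,
                   const std::vector<int>& y_dims, int axis) {
  const auto& big = x_dims.size() >= y_dims.size() ? x_dims : y_dims;
  const auto& small = x_dims.size() >= y_dims.size() ? y_dims : x_dims;
  // small must equal big[axis .. big.size()) exactly.
  if (small.size() + static_cast<size_t>(axis) != big.size()) return false;
  for (size_t i = 0; i < small.size(); ++i) {
    if (small[i] != big[axis + i]) return false;
  }
  return true;
}

int main() {
  std::printf("%d\n", DirectCompute({2, 3, 5}, {3, 5}, 1));  // 1: trailing match
  std::printf("%d\n", DirectCompute({2, 3, 5}, {3, 1}, 1));  // 0: needs broadcast
  return 0;
}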
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const int axis, - const framework::DDim& ddims, - const framework::DDim& brd_ddims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t brd_size = brd_ddims.size(); - int64_t org_size = ddims.size(); - // int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < brd_size; ++i) { - if (i < axis || i >= org_size + axis) { - axes.push_back(i); - continue; - } - if (brd_ddims[i] > ddims[i - axis]) { - axes.push_back(i); - } - } - // LOG(INFO) << "axes = " << phi::make_ddim(axes).to_str(); - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class ElementwiseMulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = x_dims.size() == (y_dims.size() + axis); - } else { - direct_compute = y_dims.size() == (x_dims.size() + axis); - } - - auto stream = ctx.template device_context().stream(); - - if (direct_compute) { - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); - runner.Run(stream); - } else { - phi::DenseTensor trans_x, trans_y; - NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); - const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); - runner.Run(stream); - } - } -}; - -template -class ElementwiseMulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - axis = (axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis); - auto stream = ctx.template device_context().stream(); - - phi::DenseTensor trans_x, trans_y; - NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); - - if (dx) { - if (dx->dims() == dout->dims()) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); - runner_dx.Run(stream); - } else { - phi::DenseTensor dx_temp(x->type()); - dx_temp.Resize(trans_x.dims()); - dx_temp.mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {}); - runner_dx.Run(stream); - ReduceDims( - ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, dx); - } - } - if (dy) { - if (dy->dims() == dout->dims()) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); - runner_dy.Run(stream); - } else { - phi::DenseTensor dy_temp(y->type()); - dy_temp.Resize(trans_y.dims()); - dy_temp.mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {}); - runner_dy.Run(stream); - ReduceDims( - ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, dy); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(elementwise_mul, - ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseMulNPUKernel, -#endif - ops::ElementwiseMulNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseMulGradNPUKernel, -#endif - ops::ElementwiseMulGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h deleted file mode 100644 index 9d31036e0c924..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_npu.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" - -namespace paddle { -namespace operators { - -template -void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* src, - int axis, - const framework::DDim& dst_dims, - phi::DenseTensor* transformed_src) { - auto stream = dev_ctx.stream(); - - // 1. 
expand the axis with dim 1 - auto src_dims = src->dims(); - phi::DenseTensor tmp_src; - tmp_src.ShareDataWith(*src); - tmp_src.Resize(src_dims); - for (int i = 0; i < src_dims.size(); ++i) { - if (src_dims[i] == 1 && dst_dims[i + axis] > 1) { - phi::DenseTensor tmp_tensor; - auto tmp_tensor_dims = tmp_src.dims(); - tmp_tensor_dims[i] = dst_dims[i + axis]; - tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); - const auto& runner = - NpuOpRunner("TileWithAxis", - {tmp_src}, - {tmp_tensor}, - {{"axis", static_cast(i)}, - {"tiles", static_cast(dst_dims[i + axis])}}); - runner.Run(stream); - tmp_src.ShareDataWith(tmp_tensor); - tmp_src.Resize(tmp_tensor_dims); - } - } - - // 2.expand the ahead axis - auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis)); - if (prev > 1) { - phi::DenseTensor tmp_tensor; - auto tmp_tensor_dims = phi::slice_ddim(dst_dims, 0, axis + src_dims.size()); - tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ExpandD", - {tmp_src}, - {tmp_tensor}, - {{"shape", phi::vectorize(tmp_tensor_dims)}}); - runner.Run(stream); - tmp_src.ShareDataWith(tmp_tensor); - tmp_src.Resize(tmp_tensor_dims); - } else { - tmp_src.Resize(phi::slice_ddim(dst_dims, 0, axis + src_dims.size())); - } - - // 3.expand the tail axis - auto post = phi::product( - phi::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size())); - if (post > 1) { - auto src_dims_vec = phi::vectorize(tmp_src.dims()); - src_dims_vec.push_back(1); - tmp_src.Resize(phi::make_ddim(src_dims_vec)); - - phi::DenseTensor tmp_tensor; - tmp_tensor.mutable_data(dst_dims, dev_ctx.GetPlace()); - const auto& runner = - NpuOpRunner("TileWithAxis", - {tmp_src}, - {tmp_tensor}, - {{"axis", static_cast(axis + src_dims.size())}, - {"tiles", static_cast(post)}}); - runner.Run(stream); - tmp_src.ShareDataWith(tmp_tensor); - } - tmp_src.Resize(dst_dims); - framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src); -} - -template -void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - int axis, - phi::DenseTensor* transformed_x, - phi::DenseTensor* transformed_y) { - auto x_dims = x->dims(); - auto y_dims = y->dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - std::vector dst_dims_vec = phi::vectorize(x_dims); - - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - dst_dims_vec = phi::vectorize(y_dims); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - int x_axis = is_xsize_larger ? 0 : axis; - int y_axis = is_xsize_larger ? 
axis : 0; - - PADDLE_ENFORCE_GE( - axis, - 0, - platform::errors::InvalidArgument( - "Axis should be greater than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LE( - axis, - max_dim, - platform::errors::InvalidArgument( - "Axis should be less than or equal to %d, but received axis is %d.", - max_dim, - axis)); - - for (int i = 0; i < x_dims.size(); ++i) { - dst_dims_vec[i + x_axis] = - std::max(dst_dims_vec[i + x_axis], static_cast(x_dims[i])); - } - for (int i = 0; i < y_dims.size(); ++i) { - dst_dims_vec[i + y_axis] = - std::max(dst_dims_vec[i + y_axis], static_cast(y_dims[i])); - } - - auto dst_dims = phi::make_ddim(dst_dims_vec); - NpuBroadcast(dev_ctx, x, x_axis, dst_dims, transformed_x); - NpuBroadcast(dev_ctx, y, y_axis, dst_dims, transformed_y); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc deleted file mode 100644 index 0a8972ac4792f..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(elementwise_add); -USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP_ITSELF(elementwise_sub); -USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); - -template -void Compare(f::Scope *scope, - const p::DeviceContext &ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto y = scope->Var("Y"); - auto tensor_y = y->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - std::vector init_y; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_y.push_back(static_cast(2.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - paddle::framework::TensorFromVector(init_y, ctx, tensor_y); - tensor_y->Resize({10, 10}); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, {{"X", {"X"}}, {"Y", {"Y"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - float expected = 0.0; - if (op_type == "elementwise_add") { - expected = 3.0; - } else if (op_type == "elementwise_sub") { - expected = -1.0; - } - EXPECT_EQ(out_vec.size(), init_x.size()); - for (uint32_t i = 0; i <
out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(expected)); - } -} - -template -void CompareGrad(f::Scope *scope, - const p::DeviceContext &ctx, - std::string op_type) { - // init - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - tensor_dout->Resize({2, 3, 5}); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - tensor_x->Resize({2, 3, 5}); - - auto y = scope->Var("Y"); - auto tensor_y = y->GetMutable(); - tensor_y->Resize({1, 5}); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - auto dy = scope->Var("DY"); - auto tensor_dy = dy->GetMutable(); - - std::vector init_dout; - for (int64_t i = 0; i < tensor_dout->numel(); ++i) { - init_dout.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize({2, 3, 5}); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}}, - {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - std::vector dy_vec; - paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); - - ctx.Wait(); - float expected_x = 0, expected_y = 0; - if (op_type == "elementwise_add_grad") { - expected_x = 1.0; - expected_y = 6.0; - } else if (op_type == "elementwise_sub_grad") { - expected_x = 1.0; - expected_y = -6.0; - } - - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_EQ(dx_vec[i], static_cast(expected_x)); - } - for (uint32_t i = 0; i < dy_vec.size(); i++) { - EXPECT_EQ(dy_vec[i], static_cast(expected_y)); - } -} - -TEST(elementwise_add, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "elementwise_add"); -} - -TEST(elementwise_sub, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "elementwise_sub"); -} - -TEST(elementwise_sub, NPU_fp16) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "elementwise_sub"); -} - -TEST(elementwise_sub_grad, NPU) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "elementwise_sub_grad"); -} - -TEST(elementwise_add_grad, NPU) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "elementwise_add_grad"); -} diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc deleted file mode 100644 index d0cf1ac28b1c6..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
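// Before the pow kernels below: a standalone sketch of how the deleted
// NpuElementWiseOpBroadcast helper above derives the destination shape both
// operands are broadcast to. Start from the higher-rank shape, offset each
// operand by its alignment axis, and take the per-dimension max. Plain
// vectors; the function name is invented for this sketch.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> BroadcastDstDims(const std::vector<int>& x,
                                  const std::vector<int>& y, int axis) {
  const bool x_larger = x.size() >= y.size();
  std::vector<int> dst = x_larger ? x : y;
  const int x_axis = x_larger ? 0 : axis;
  const int y_axis = x_larger ? axis : 0;
  for (size_t i = 0; i < x.size(); ++i)
    dst[i + x_axis] = std::max(dst[i + x_axis], x[i]);
  for (size_t i = 0; i < y.size(); ++i)
    dst[i + y_axis] = std::max(dst[i + y_axis], y[i]);
  return dst;
}

int main() {
  // x: [2, 1, 5], y: [3, 1] at axis 1 -> dst: [2, 3, 5].
  for (int d : BroadcastDstDims({2, 1, 5}, {3, 1}, 1)) std::printf("%d ", d);
  std::printf("\n");
  return 0;
}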
*/ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" - -namespace paddle { -namespace operators { - -template -class ElementwisePowNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - int axis = ctx.Attr("axis"); - - out->mutable_data(place); - - bool direct_compute = false; - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = - (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); - if (x_dims.size() >= y_dims.size()) { - direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size()); - } else { - direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); - } - - auto stream = dev_ctx.stream(); - - if (direct_compute) { - const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); - runner.Run(stream); - } else { - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - const auto& runner = - NpuOpRunner("Pow", {transformed_x, transformed_y}, {*out}, {}); - runner.Run(stream); - } - } -}; - -template -class ElementwisePowGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - auto place = ctx.GetPlace(); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - axis = - (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); - phi::DenseTensor transformed_x, transformed_y; - NpuElementWiseOpBroadcast( - dev_ctx, x, y, axis, &transformed_x, &transformed_y); - - auto dout_dims = dout->dims(); - auto stream = dev_ctx.stream(); - // Reshape info vector. 
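// A scalar sketch of the gradients the surrounding pow-grad kernel assembles
// from CANN primitives: dx = dout * y * x^(y-1) and dy = dout * ln(x) * x^y,
// with ln(x) realized as log1p(x - 1), matching the kernel's Sub + Log1p
// chain. Double scalars stand in for tensors; the function name is invented
// for this sketch.
#include <cmath>
#include <cstdio>

void PowGrad(double x, double y, double dout, double* dx, double* dy) {
  *dx = dout * y * std::pow(x, y - 1.0);              // Mul, Pow(x, y-1), Mul
  *dy = dout * std::log1p(x - 1.0) * std::pow(x, y);  // Sub, Log1p, Pow, Mul
}

int main() {
  double dx = 0.0, dy = 0.0;
  PowGrad(2.0, 3.0, 1.0, &dx, &dy);
  std::printf("dx=%g dy=%g\n", dx, dy);  // dx=12, dy=ln(2)*8
  return 0;
}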
- std::vector reduce_axes; - if (dx) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, place); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dx->mutable_data(place); - phi::DenseTensor tmp_dx; - tmp_dx.mutable_data(dout_dims, place); - - // dx = dout * y * pow(x, y - 1); - phi::DenseTensor PowGrad_dx_temp1(dout->type()); - PowGrad_dx_temp1.mutable_data(dout->dims(), place); - const auto& runner_PowGrad_dx_temp1 = - NpuOpRunner("Mul", {*dout, transformed_y}, {PowGrad_dx_temp1}, {}); - runner_PowGrad_dx_temp1.Run(stream); - - phi::DenseTensor one_dx(transformed_y.type()); - one_dx.mutable_data(transformed_y.dims(), place); - const auto& runner_one_dx = - NpuOpRunner("OnesLike", {transformed_y}, {one_dx}, {}); - runner_one_dx.Run(stream); - - phi::DenseTensor sub_dx(transformed_y.type()); - sub_dx.mutable_data(transformed_y.dims(), place); - const auto& runner_sub_dx = - NpuOpRunner("Sub", {transformed_y, one_dx}, {sub_dx}, {}); - runner_sub_dx.Run(stream); - - phi::DenseTensor PowGrad_dx_temp2(transformed_x.type()); - PowGrad_dx_temp2.mutable_data(transformed_x.dims(), place); - const auto& runner_PowGrad_dx_temp2 = - NpuOpRunner("Pow", {transformed_x, sub_dx}, {PowGrad_dx_temp2}, {}); - runner_PowGrad_dx_temp2.Run(stream); - - const auto& runner_dx = NpuOpRunner( - "Mul", {PowGrad_dx_temp1, PowGrad_dx_temp2}, {tmp_dx}, {}); - runner_dx.Run(stream); - - if (x_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + x_dims.size()) || - (dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dx}, - {*dx}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dx, place, dev_ctx, dx); - } - } - if (dy) { - phi::DenseTensor zero_tensor(dout->type()); - zero_tensor.mutable_data(dout_dims, place); - FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); - - dy->mutable_data(place); - phi::DenseTensor tmp_dy; - tmp_dy.mutable_data(dout_dims, place); - - // dy = dout * log(x) * pow(x, y) - phi::DenseTensor PowGrad_dy_temp1(transformed_x.type()); - PowGrad_dy_temp1.mutable_data(transformed_x.dims(), place); - const auto& runner_PowGrad_dy_temp1 = NpuOpRunner( - "Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {}); - runner_PowGrad_dy_temp1.Run(stream); - - phi::DenseTensor one_dy(transformed_x.type()); - one_dy.mutable_data(transformed_x.dims(), place); - const auto& runner_one_dy = - NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {}); - runner_one_dy.Run(stream); - - phi::DenseTensor sub_dy(transformed_x.type()); - sub_dy.mutable_data(transformed_x.dims(), place); - const auto& runner_sub_dy = - NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {}); - runner_sub_dy.Run(stream); - - phi::DenseTensor log_dy(transformed_x.type()); - log_dy.mutable_data(transformed_x.dims(), place); - const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {}); - runner_log_dy.Run(stream); - - phi::DenseTensor PowGrad_dy_temp2(transformed_x.type()); - PowGrad_dy_temp2.mutable_data(transformed_x.dims(), place); - const auto& runner_PowGrad_dy_temp2 = NpuOpRunner( - "Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {}); - runner_PowGrad_dy_temp2.Run(stream); - - const auto& runner_dy = - NpuOpRunner("Mul", 
{*dout, PowGrad_dy_temp2}, {tmp_dy}, {}); - runner_dy.Run(stream); - - if (y_dims != dout_dims) { - reduce_axes.clear(); - - int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + y_dims.size()) || - (dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } - } - if (!reduce_axes.empty()) { - const auto& runner = - NpuOpRunner("ReduceSumD", - {tmp_dy}, - {*dy}, - {{"axes", reduce_axes}, {"keep_dims", false}}); - runner.Run(stream); - } - } else { - framework::TensorCopy(tmp_dy, place, dev_ctx, dy); - } - } - if (!dx && !dy) { - PADDLE_THROW(platform::errors::Unavailable( - "At least one of dx and dy must be provided.")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - elementwise_pow, - ops::ElementwisePowNPUKernel, - ops::ElementwisePowNPUKernel, - ops::ElementwisePowNPUKernel, - ops::ElementwisePowNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_pow_grad, - ops::ElementwisePowGradNPUKernel, - ops::ElementwisePowGradNPUKernel, - ops::ElementwisePowGradNPUKernel, - ops::ElementwisePowGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc deleted file mode 100644 index 2b9d83cc57d97..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseSubNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ElementwiseSubGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(zhiqiu): It seems the Ascend Sub op follows the broadcast - // semantics with default axis=-1, - // so sub_grad should do a reduce if needed. - // For example, the shape of each variable in elementwise_sub: - // x, dx: [2, 3, 5] - // y, dy: [1, 5] - // out, dout: [2, 3, 5] - // Then, out = x - y => dx = dout, dy = -dout - // And, the shape of dy can be computed by a two-stage reduce, - // 1.
[2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. - // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - // For dx - // stage 1 - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - phi::DenseTensor* tmp_dout = const_cast(dout); - phi::DenseTensor reduced_dout(dx->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", - {*tmp_dout}, - {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopy( - *tmp_dout, - ctx.GetPlace(), - ctx.template device_context(), - dx); - } - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - // For dy - // stage 1 - auto reduce_ndim = dout->dims().size() - dy->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - phi::DenseTensor* tmp_dout = const_cast(dout); - phi::DenseTensor reduced_dy(dy->type()); - phi::DenseTensor reduced_dout(dy->type()); - - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - phi::DenseTensor* tmp_dy = tmp_dout; - for (auto i = 0; i < dy->dims().size(); ++i) { - if (dy->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - reduced_dy.Resize(dy->dims()); - reduced_dy.mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", - {*tmp_dout}, - {reduced_dy}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - tmp_dy = &reduced_dy; - } - - // stage 3, negative - const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(elementwise_sub, - ops::ElementwiseSubNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseSubNPUKernel, -#endif - ops::ElementwiseSubNPUKernel, - ops::ElementwiseSubNPUKernel); - -REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, - ops::ElementwiseSubGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ElementwiseSubGradNPUKernel, -#endif - ops::ElementwiseSubGradNPUKernel, - ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc deleted file mode 100644 index 77f12f17ce258..0000000000000 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/expand_as_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandAsV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, - rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, - rank)); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, - MAX_RANK_SUPPORTED)); - ExpandAs(context); - } - - protected: - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], - 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in " - "target tensor for expand_as_v2 op.", - vec_in_dims[i], - target_shape[i])); - } - } - auto* out0 = context.Output("Out"); - - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - const auto& runner = - NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}}); - - auto stream = - context.template device_context() - .stream(); - - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc deleted file mode 100644 index d7e553b83bb67..0000000000000 --- a/paddle/fluid/operators/expand_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
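// Returning to the NOTE in the deleted elementwise_sub grad kernel above: a
// standalone sketch of its two-stage reduction of dout down to dy's shape.
// Stage 1 sums away the leading rank difference (keep_dims = false); stage 2
// sums the remaining axes where dy is 1 (keep_dims = true); the result is
// then negated. Names are invented for this sketch.
#include <cstdio>
#include <vector>

void SubGradDyAxes(const std::vector<int>& dout_dims,
                   const std::vector<int>& dy_dims,
                   std::vector<int>* stage1, std::vector<int>* stage2) {
  const int reduce_ndim =
      static_cast<int>(dout_dims.size()) - static_cast<int>(dy_dims.size());
  for (int i = 0; i < reduce_ndim; ++i) stage1->push_back(i);
  for (size_t i = 0; i < dy_dims.size(); ++i)
    if (dy_dims[i] == 1) stage2->push_back(static_cast<int>(i));
}

int main() {
  // dout: [2, 3, 5], dy: [1, 5] -> stage 1 reduces axis 0 ([2,3,5] -> [3,5]),
  // stage 2 reduces axis 0 again ([3,5] -> [1,5]), as in the NOTE above.
  std::vector<int> s1, s2;
  SubGradDyAxes({2, 3, 5}, {1, 5}, &s1, &s2);
  std::printf("stage1 axes: %zu, stage2 axes: %zu\n", s1.size(), s2.size());
  return 0;
}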
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - switch (rank) { - case 1: - Expand<1>(context); - break; - case 2: - Expand<2>(context); - break; - case 3: - Expand<3>(context); - break; - case 4: - Expand<4>(context); - break; - case 5: - Expand<5>(context); - break; - case 6: - Expand<6>(context); - break; - } - } - - protected: - template - void Expand(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto expand_times = get_expand_times(context); - PADDLE_ENFORCE_EQ(static_cast(in_dims.size()), - expand_times.size(), - platform::errors::InvalidArgument( - "The number of elements (%d) of 'expand_times' for " - "Op(expand) must be equal to the number " - "of dimensions (%d) of the input.", - expand_times.size(), - static_cast(in_dims.size()))); - auto* out0 = context.Output("Out"); - framework::DDim out_dims(in_dims); - - for (size_t i = 0; i < expand_times.size(); ++i) { - out_dims[i] *= expand_times[i]; - } - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - out0->Resize(out_dims); - out0->mutable_data(place); - - bool is_expand_times_all_one = - (out0->numel() == in0->numel()) ? true : false; - - if (is_expand_times_all_one) { - memory::Copy(place, - out0->mutable_data(place), - place, - in0->data(), - in0->numel() * sizeof(T), - stream); - if (out_dims != in_dims) { - out0->Resize(out_dims); - } - } else { - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc deleted file mode 100644 index e9d12beaa78de..0000000000000 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
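// A plain sketch of the shape arithmetic in the deleted expand kernel above:
// out_dims[i] = in_dims[i] * expand_times[i]. When every factor is 1 (the
// kernel detects this via out->numel() == in->numel()) it degrades to a raw
// memory::Copy instead of launching TileD. Names invented for this sketch.
#include <cstdio>
#include <vector>

std::vector<int> ExpandOutDims(const std::vector<int>& in_dims,
                               const std::vector<int>& expand_times,
                               bool* all_ones) {
  std::vector<int> out(in_dims);
  *all_ones = true;
  for (size_t i = 0; i < expand_times.size(); ++i) {
    out[i] *= expand_times[i];
    if (expand_times[i] != 1) *all_ones = false;
  }
  return out;
}

int main() {
  bool all_ones = false;
  const auto out = ExpandOutDims({3, 1, 7}, {1, 10, 1}, &all_ones);
  // Matches the nearby expand test: [3, 1, 7] * [1, 10, 1] -> [3, 10, 7],
  // taking the tiled path.
  std::printf("[%d, %d, %d] copy=%d\n", out[0], out[1], out[2], all_ones);
  return 0;
}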
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(expand); -USE_OP_DEVICE_KERNEL(expand, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto in = scope->Var("X"); - auto expand_times = scope->Var("ExpandTimes"); - auto out = scope->Var("Out"); - auto in_t = in->GetMutable(); - auto out_t = out->GetMutable(); - auto expand_times_t = expand_times->GetMutable(); - - auto place = ctx.GetPlace(); - paddle::framework::TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); - paddle::framework::TensorFromVector( - std::vector({1, 10, 1}), ctx, expand_times_t); - - in_t->Resize(phi::make_ddim({3, 1, 7})); - expand_times_t->Resize(phi::make_ddim({3})); - out_t->Resize(phi::make_ddim({3, 10, 7})); - out_t->mutable_data(place); - - f::AttributeMap attrs = {{}}; - auto op = - f::OpRegistry::CreateOp("expand", - {{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}}, - {{"Out", {"Out"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - auto out_dim = out_t->dims(); - EXPECT_EQ(out_dim.at(0), 3); - EXPECT_EQ(out_dim.at(1), 10); - EXPECT_EQ(out_dim.at(2), 7); -} - -TEST(expand, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc deleted file mode 100644 index 7f37fc67d529d..0000000000000 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - - auto in_dims = X->dims(); - auto expand_shape = get_expand_shape(ctx); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = expand_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector final_expand_shape(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size cannot be zero.")); - if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] --> - // final_expand_shape = [3,4,10,2] - PADDLE_ENFORCE_GT( - expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_v2 op.", - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X = - // [10,1] --> final_expand_shape = - // [3,4,10,4] - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - expand_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_v2 op.", - vec_in_dims[i], - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else { - final_expand_shape[i] = expand_shape[i]; - } - } else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape - // = [3,4,10,2] - PADDLE_ENFORCE_EQ( - expand_shape[i], - -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_v2 op, " - "only -1 is supported, but the value received is %d.", - expand_shape[i])); - final_expand_shape[i] = vec_in_dims[i]; - } - } - - framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}}; - - auto rank = X->dims().size(); - - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be positive, " - "but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be less than " - "or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto shape_size = final_expand_shape.size(); - PADDLE_ENFORCE_GE( - shape_size, - rank, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for expand_v2_npu op must " - "be " - "greater than or equal to the rank (%d) of the input 'X'.", - shape_size, - rank)); - PADDLE_ENFORCE_LE(shape_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for " - "expand_v2_npu op must be " - "less than or equal to %d.", - shape_size, - MAX_RANK_SUPPORTED)); - - framework::DDim out_dims = phi::make_ddim(final_expand_shape); - Out->Resize(out_dims); - Out->mutable_data(ctx.GetPlace()); - - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - 
}; - - if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::BOOL) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class ExpandV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // case 1: reduce dout dims to dx dims - // For example: [2, 120] --> [120] - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - - phi::DenseTensor tmp_dout(dout->dtype()); - phi::DenseTensor reduced_dout(dx->dtype()); - tmp_dout.ShareDataWith(*dout); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - tmp_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = reduced_dout; - } - - // case 2: reduce axis of dout in which dim is 1 - // For example: [12, 140] --> [1, 140] - - // case 3: copy dout to dx when shape is totally same, and dim in dx != 1 - // For example: [2, 10, 5] --> [2, 10, 5] - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", - {tmp_dout}, - {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_v2, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - expand_v2_grad, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel); diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc deleted file mode 100644 index ee71ebee9b066..0000000000000 --- a/paddle/fluid/operators/eye_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
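The shape resolution in the deleted ExpandV2NPUKernel above is self-contained enough to model on the host: the input rank is left-padded with 1s, positive shape entries must match any non-singleton input dimension, and -1 means "keep the input dimension". A sketch in plain standard C++ (illustrative names; exceptions stand in for PADDLE_ENFORCE):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    std::vector<int64_t> FinalExpandShape(std::vector<int64_t> in_dims,
                                          const std::vector<int64_t>& shape) {
      const size_t diff = shape.size() - in_dims.size();
      in_dims.insert(in_dims.begin(), diff, 1);  // left-pad the rank with 1s
      std::vector<int64_t> out(in_dims.size());
      for (size_t i = 0; i < in_dims.size(); ++i) {
        if (shape[i] == 0) throw std::invalid_argument("expanded size is zero");
        if (i < diff) {
          // Newly created leading dimension: must be positive.
          if (shape[i] < 0) throw std::invalid_argument("new dim must be > 0");
          out[i] = shape[i];
        } else if (shape[i] > 0) {
          if (in_dims[i] != 1 && in_dims[i] != shape[i])
            throw std::invalid_argument("non-singleton dim mismatch");
          out[i] = shape[i];
        } else {  // only -1 is accepted here: keep the input dimension
          out[i] = in_dims[i];
        }
      }
      return out;
    }

    int main() {
      // Mirrors the comment in the kernel:
      // shape = [3,4,-1,-1], X = [10,2] --> final shape = [3,4,10,2]
      assert(FinalExpandShape({10, 2}, {3, 4, -1, -1}) ==
             (std::vector<int64_t>{3, 4, 10, 2}));
    }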
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class EyeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto num_rows = ctx.Attr("num_rows"); - - auto d_nums = ctx.Attr("dtype"); - auto dtype = - ConvertToNpuDtype(static_cast(d_nums)); - - auto num_columns = ctx.Attr("num_columns"); - if (num_columns == -1) num_columns = num_rows; - - framework::NPUAttributeMap attr_input = { - {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - eye, - ops::EyeNPUKernel, - ops::EyeNPUKernel, - ops::EyeNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc deleted file mode 100644 index 62d3e5a82f5a3..0000000000000 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
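The deleted EyeNPUKernel above only marshals attributes: num_columns defaults to num_rows when it is -1, and CANN's Eye operator fills the matrix. A host-side model of the semantics being delegated (plain standard C++, illustrative names):

    #include <cassert>
    #include <vector>

    // num_rows x num_columns matrix with ones on the main diagonal;
    // num_columns == -1 means square, matching the kernel's default.
    std::vector<std::vector<float>> Eye(int num_rows, int num_columns) {
      if (num_columns == -1) num_columns = num_rows;
      std::vector<std::vector<float>> m(
          num_rows, std::vector<float>(num_columns, 0.0f));
      for (int i = 0; i < num_rows && i < num_columns; ++i) m[i][i] = 1.0f;
      return m;
    }

    int main() {
      auto m = Eye(2, -1);  // 2x2 identity
      assert(m[0][0] == 1.0f && m[0][1] == 0.0f && m[1][1] == 1.0f);
    }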
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeNPUKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), - false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); - - auto stream = - context.template device_context() - .stream(); - - auto shape = out->dims(); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_tmp) - .AddOutput(*out) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_any_like, - ops::FillAnyLikeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::FillAnyLikeNPUKernel, -#endif - ops::FillAnyLikeNPUKernel, - ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc deleted file mode 100644 index fed75fc018a0c..0000000000000 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto float_value = ctx.Attr("value"); - auto str_value = ctx.Attr("str_value"); - auto force_cpu = ctx.Attr("force_cpu"); - - auto *out = ctx.Output("Out"); - auto *in = ctx.Input("Input"); - if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { - // set the correct batch size for the phi::DenseTensor. 
- auto odims = out->dims(); - int output_dim_idx = ctx.Attr("output_dim_idx"); - odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; - out->mutable_data(odims, ctx.GetPlace()); - } - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); - if (cpu_place) { - auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; - out->mutable_data(platform::CPUPlace(), - framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), - out, - static_cast(value)); - } else { - out->mutable_data(ctx.GetPlace(), - framework::TransToPhiDataType(data_type)); - phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, value); - - auto stream = - ctx.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FillD", - {tensor_tmp}, - {*out}, - {{"dims", phi::vectorize(out->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - int>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc deleted file mode 100644 index 0724caf32793e..0000000000000 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
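When the input of the deleted fill_constant_batch_size_like kernel carries LoD information, the batch dimension is taken from a LoD level rather than from the dense shape: a level stores cumulative sequence offsets, so N offsets describe N - 1 sequences. A small sketch of that arithmetic (plain standard C++, illustrative name):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Offsets {0, 2, 5} describe two sequences, [0, 2) and [2, 5),
    // hence size() - 1 below -- the same expression the kernel uses.
    size_t BatchSizeFromLoDLevel(const std::vector<size_t>& offsets) {
      return offsets.size() - 1;
    }

    int main() { assert(BatchSizeFromLoDLevel({0, 2, 5}) == 2); }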
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -template -class FillConstantNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto str_value = ctx.Attr("str_value"); - auto float_value = ctx.Attr("value"); - - auto *out_var = ctx.Output("Out"); - auto stream = - ctx.template device_context() - .stream(); - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - auto shape = GetShape(ctx); - - out_var->mutable_data(shape, ctx.GetPlace()); - if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPhiDataType(data_type)); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, value); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(*out_var) - .Run(stream); - } else { - const auto &dev_ctx = - ctx.template device_context(); - auto op_func = [&shape, &value]( - const std::vector &inputs, - const std::vector &outputs, - const NPUAttributeMap &attrs, - const platform::NPUDeviceContext &dev_ctx) { - Tensor tensor_value; - tensor_value.mutable_data({1}, dev_ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, - static_cast(value)); - - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(outputs[0]) - .Run(dev_ctx.stream()); - }; - NpuOpRunner::TypeAdapter({}, - {*out_var}, - {}, - dev_ctx, - op_func, - {}, - {framework::proto::VarType::UINT8}); - } - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - fill_constant, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::FillConstantNPUKernel, -#endif - paddle::operators::FillConstantNPUKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op_npu.cc b/paddle/fluid/operators/fill_zeros_like_op_npu.cc deleted file mode 100644 index 6cedc658f76f5..0000000000000 --- a/paddle/fluid/operators/fill_zeros_like_op_npu.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
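Both fill_constant kernels above share the same str_value convention: "inf", "-inf", and "nan" are matched explicitly because they cannot be read back through a stream, and anything else goes through stream extraction (via an int64_t temporary for integral types). A condensed model in plain standard C++ (a single double stands in for the kernels' two temporaries):

    #include <cassert>
    #include <cmath>
    #include <limits>
    #include <sstream>
    #include <string>

    template <typename T>
    T ParseFillValue(const std::string& str_value, float float_value) {
      if (str_value.empty()) return static_cast<T>(float_value);
      // Handle NaN/Inf first, which cannot be read from a stream.
      if (str_value == "inf")
        return static_cast<T>(std::numeric_limits<double>::infinity());
      if (str_value == "-inf")
        return static_cast<T>(-std::numeric_limits<double>::infinity());
      if (str_value == "nan")
        return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
      std::stringstream convert_stream(str_value);
      double tmp_value;
      convert_stream >> tmp_value;
      return static_cast<T>(tmp_value);
    }

    int main() {
      assert(std::isinf(ParseFillValue<float>("inf", 0.0f)));
      assert(ParseFillValue<int>("42", 0.0f) == 42);
      assert(ParseFillValue<float>("", 1.5f) == 1.5f);  // fallback path
    }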
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fill_zeros_like_op.h" - -namespace paddle { -namespace operators { - -template -class FillZerosLikeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - out->mutable_data(context.GetPlace()); - auto stream = - context.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*out}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel); diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc deleted file mode 100644 index 2e43c33efd575..0000000000000 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
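Several of the deleted kernels above (expand_v2 and fill_constant, for example) route unsupported dtypes through NpuOpRunner::TypeAdapter: inputs are cast to a type the CANN operator accepts (bool to uint8, int64 to int32), the operator runs, and the outputs are cast back. A schematic stand-in for that cast-run-cast-back pattern, with std::vector playing the role of tensors and element-wise identity as the wrapped op (plain standard C++, illustrative names):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    std::vector<bool> RunBoolOpViaUint8(const std::vector<bool>& in) {
      std::vector<uint8_t> widened(in.begin(), in.end());      // cast to uint8
      std::vector<uint8_t> result = widened;                   // run the op
      return std::vector<bool>(result.begin(), result.end());  // cast back
    }

    int main() {
      std::vector<bool> v{true, false, true};
      assert(RunBoolOpViaUint8(v) == v);
    }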
-#include "paddle/fluid/operators/flatten_op.h" - -namespace paddle { -namespace operators { - -template -class Flatten2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto &axis = context.Attr("axis"); - out->mutable_data(context.GetPlace(), in->type()); - framework::NPUAttributeMap attr_input = {{"axis", axis}}; - - auto stream = - context.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FlattenV2", {*in}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class Flatten2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -template -class FlattenContiguousRangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *Out = ctx.Output("Out"); - int start_axis = ctx.Attr("start_axis"); - int stop_axis = ctx.Attr("stop_axis"); - - Out->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("FlattenV2", - {*X}, - {*Out}, - {{"axis", static_cast(start_axis)}, - {"end_axis", static_cast(stop_axis)}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class FlattenContiguousRangeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(flatten2, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel); -REGISTER_OP_NPU_KERNEL(flatten2_grad, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel); -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range_grad, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel); diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc 
b/paddle/fluid/operators/gather_nd_op_npu.cc deleted file mode 100644 index feb1567e58d78..0000000000000 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class GatherNdNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->template mutable_data(ctx.GetPlace()); - - if (x->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out); - return; - } - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, - true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = ctx.template device_context().stream(); - runner.Run(stream); - } -}; - -template -class GatherNdGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *p = dx->mutable_data(ctx.GetPlace()); - - if (dx->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*dout, ctx.GetPlace(), ctx.device_context(), dx); - return; - } - - phi::DenseTensor tmp_tensor(index->type()); - phi::DenseTensor tmp_tensor2(dout->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {1, index_dims[0]}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - - tmp_tensor2.ShareDataWith(*dout); - std::vector new_dim2{1}; - for (int i = index->numel(); i < x->dims().size(); i++) { - new_dim2.push_back(x->dims()[i]); - } - tmp_tensor2.Resize(phi::make_ddim(new_dim2)); - dout = &tmp_tensor2; - } - - auto stream = ctx.template device_context().stream(); - platform::NPUMemsetAsync( - static_cast(p), 0, dx->numel() * sizeof(T), stream); - - const auto &runner_scatter = NpuOpRunner( - "ScatterNdAdd", 
{*dx, *index, *dout}, {*dx}, {{"use_locking", false}}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gather_nd, - ops::GatherNdNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::GatherNdNPUKernel, -#endif - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL(gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc deleted file mode 100644 index ab42d78a0a1d7..0000000000000 --- a/paddle/fluid/operators/gather_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class GatherOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto &runner = NpuOpRunner( - "Gather", {*x, *index}, {*out}, {{"validate_indices", true}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GatherGradOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - // step1: Unsqueeze index - phi::DenseTensor tmp_tensor(index->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {index_dims[0], 1}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - } - - auto stream = - ctx.template device_context() - .stream(); - - // step2: ZerosLike x in device - Tensor zeroslike_xout(dx->type()); - zeroslike_xout.Resize(x->dims()); - auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); - - platform::NPUMemsetAsync( - static_cast(p), 0, zeroslike_xout.numel() * sizeof(T), stream); - - // step3: scatter(x_grad) - const auto &runner_scatter = NpuOpRunner( - "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_grad, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc deleted 
file mode 100644 index 69d82ecaedeea..0000000000000 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gather); -USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP_ITSELF(gather_grad); -USE_OP_DEVICE_KERNEL(gather_grad, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - std::vector init_x; - for (int64_t i = 1; i < 7; ++i) { - // 1,2,3,4,5,6 - init_x.push_back(static_cast(i)); - } - - // [[1, 2],[3, 4],[5, 6]] - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_index = {1, 2}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - f::AttributeMap attrs = {{"validate_indices", true}}; - auto op = f::OpRegistry::CreateOp( - op_type, {{"X", {"X"}}, {"Index", {"Index"}}}, {{"Out", {"Out"}}}, attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - uint32_t expected_size = 4; - EXPECT_EQ((uint32_t)out_vec.size(), expected_size); - - // {3, 4, 5, 6} - std::vector expected_out_vec; - for (int64_t i = 3; i < 7; ++i) { - expected_out_vec.push_back(static_cast(i)); - } - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_out_vec[i]); - } -} - -template -void CompareGrad(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector init_index = {0, 1}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - 
tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize(phi::make_ddim({2, 2})); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, - {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - ctx.Wait(); - - uint32_t expected_size = 3 * 2; - EXPECT_EQ((uint32_t)dx_vec.size(), expected_size); - - std::vector expected_dx_vec = {5.0, 10.0, 2.0, 3.0, 0.0, 0.0}; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - VLOG(3) << "dx_vec[i]=" << dx_vec[i]; - EXPECT_EQ(dx_vec[i], expected_dx_vec[i]); - } -} - -TEST(gather, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather, NPU_fp16) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "gather_grad"); -} diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc deleted file mode 100644 index 9b3c23ad2b9c1..0000000000000 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
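The deleted gather tests above pin down the axis-0 semantics: with X = [[1,2],[3,4],[5,6]] and Index = [1,2], Out is [[3,4],[5,6]], and the backward pass scatters the dOut rows into a zero tensor at the indexed rows (TensorScatterUpdate in the kernel). A host-side model of the forward rule (plain standard C++, illustrative name):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Out[i] = X[Index[i]] along axis 0, as checked by the deleted test.
    std::vector<std::vector<float>> Gather(
        const std::vector<std::vector<float>>& x,
        const std::vector<int32_t>& index) {
      std::vector<std::vector<float>> out;
      out.reserve(index.size());
      for (int32_t i : index) out.push_back(x[i]);
      return out;
    }

    int main() {
      std::vector<std::vector<float>> x{{1, 2}, {3, 4}, {5, 6}};
      auto out = Gather(x, {1, 2});
      assert(out[0][0] == 3 && out[1][1] == 6);  // {{3, 4}, {5, 6}}
    }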
*/ - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::normal_distribution dist(mean, std); - - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = dist(*engine); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gaussian_random, ops::NPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc deleted file mode 100644 index 1b40a6fbb454c..0000000000000 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class GeluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class GeluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(pangyoki): In the original implementation of GeluGrad op, the input - // is {*dout, *x, out}, where out = Gelu(x). However, we find that variable - // `out` was not actually used. In order to improve performance, the - // useless GELU operation was deleted. 
- // We directly use `*dout` as a placeholder to replace `out`, it will not - // be used in calculations. - const auto& runner_dx = - NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - gelu, - ops::GeluNPUKernel, - ops::GeluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gelu_grad, - ops::GeluGradNPUKernel, - ops::GeluGradNPUKernel); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc deleted file mode 100644 index 9dca0bb8cba0f..0000000000000 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gelu); -USE_OP_DEVICE_KERNEL(gelu, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp( - "gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - float expected = 0.841192; - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_FLOAT_EQ(out_vec[i], static_cast(expected)); - } -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_dout; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_dout.push_back(static_cast(1.0)); - } - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize({10, 
10}); - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp("gelu_grad", - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - float expected = 1.082964; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_FLOAT_EQ(dx_vec[i], static_cast(expected)); - } -} - -TEST(gelu, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(gelu_grad, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc deleted file mode 100644 index 49fdd3566825b..0000000000000 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
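The expected value hard-coded in the deleted gelu test above (0.841192 at x = 1) matches the tanh approximation of GELU rather than the exact erf form, which gives about 0.841345 at x = 1; this suggests the CANN Gelu operator implements the approximation. A quick check in plain standard C++:

    #include <cassert>
    #include <cmath>

    // Tanh approximation of GELU:
    //   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    float GeluTanh(float x) {
      const float kAlpha = 0.7978845608f;  // sqrt(2 / pi)
      return 0.5f * x *
             (1.0f + std::tanh(kAlpha * (x + 0.044715f * x * x * x)));
    }

    int main() {
      // Agrees with the test's expected 0.841192 to roughly 1e-5.
      assert(std::fabs(GeluTanh(1.0f) - 0.841192f) < 1e-4f);
    }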
*/ - -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct GroupNormFunction { - public: - explicit GroupNormFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void ReduceMean(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceMeanD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Div(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Sqrt(const phi::DenseTensor* x, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - phi::DenseTensor ReduceMeanToNG(const phi::DenseTensor* x, - const DataLayout& data_layout, - const int64_t N, - const int64_t C, - const int64_t H, - const int64_t W, - const int G) { - phi::DenseTensor y(x->type()); - // y.mutable_data( {N,G,1}, place ); - if (data_layout == DataLayout::kNCHW) { - y.mutable_data({N, G, 1}, place); - // shape of x is [N, G, C*H*W/G] - this->ReduceMean(x, &y, std::vector{2}); - } else { - y.mutable_data({N, 1, G}, place); - // shape of x is [N, C*H*W/G, G] - phi::DenseTensor x_trans(x->type()); - x_trans.mutable_data({N, G, C * H * W / G}, place); - this->Transpose(x, &x_trans, std::vector{0, 2, 1}); - this->ReduceMean(&x_trans, &y, std::vector{2}); - } - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -class GroupNormNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto groups = ctx.Attr("groups"); - - auto place = ctx.GetPlace(); - phi::DenseTensor xnorm(x->type()); - xnorm.mutable_data(x->dims(), place); - GroupNormFunction F(ctx); - if (data_layout != DataLayout::kNCHW) { - xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); - F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); - } else { - paddle::framework::TensorCopy(*x, platform::NPUPlace(), &xnorm); - } - auto N = xnorm.dims()[0]; - auto C = xnorm.dims()[1]; - auto H = xnorm.dims()[2]; - auto W = xnorm.dims()[3]; - xnorm.Resize({N * groups, C * H * W / groups}); - std::vector axis = {1}; - auto reduce_dim = mean->dims(); - - mean->mutable_data({N * groups, 1}, place); - var->mutable_data({N * groups, 1}, place); - y->mutable_data(place); - F.ReduceMean(&xnorm, mean, axis); - - F.Sub(&xnorm, mean, &xnorm); - phi::DenseTensor sqr(x->type()); - sqr.mutable_data(xnorm.dims(), place); - - F.Mul(&xnorm, &xnorm, &sqr); - F.ReduceMean(&sqr, var, axis); - phi::DenseTensor std(x->type()); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - y->Resize(xnorm.dims()); - F.Div(&xnorm, &std, y); - y->Resize({N, C, H, W}); - if (scale) { - phi::DenseTensor scale_t(scale->type()); - scale_t.ShareDataWith(*scale); - scale_t.Resize({C, 1, 1}); - F.Mul(y, &scale_t, y); - } - if (bias) { - phi::DenseTensor bias_t(bias->type()); - bias_t.ShareDataWith(*bias); - bias_t.Resize({C, 1, 1}); - F.Add(y, &bias_t, y); - } - if (data_layout != DataLayout::kNCHW) { - F.Transpose(y, y, std::vector{0, 2, 3, 1}); - y->Resize({x->dims()}); - } - mean->Resize(reduce_dim); - var->Resize(reduce_dim); - } -}; - -template -class GroupNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* y = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - const auto G = ctx.Attr("groups"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - GroupNormFunction F(ctx); - auto place = ctx.GetPlace(); - auto _type = y->type(); - - phi::DenseTensor xnorm(_type); - xnorm.mutable_data(y->dims(), place); - phi::DenseTensor scale_share(_type); - scale_share.ShareDataWith(*scale); - phi::DenseTensor bias_share(_type); - bias_share.ShareDataWith(*bias); - - int64_t N = y->dims()[0]; - int64_t C, H, W; - framework::DDim scale_bias_dim; - if (data_layout == DataLayout::kNCHW) { - C = y->dims()[1]; - H = y->dims()[2]; - W = y->dims()[3]; - scale_bias_dim = phi::make_ddim({C, 1, 1}); - } else { - C = y->dims()[3]; - H = y->dims()[1]; - W = y->dims()[2]; - scale_bias_dim = phi::make_ddim({1, 1, C}); - } - 
scale_share.Resize(scale_bias_dim); - bias_share.Resize(scale_bias_dim); - F.Sub(y, &bias_share, &xnorm); - F.DivNoNan(&xnorm, &scale_share, &xnorm); - - if (d_bias) { - d_bias->mutable_data(place); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); - } else { - F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); - } - } - if (d_scale) { - d_scale->mutable_data(place); - phi::DenseTensor dy_xnorm(_type); - dy_xnorm.mutable_data(d_y->dims(), place); - F.Mul(d_y, &xnorm, &dy_xnorm); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); - } else { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); - } - } - - // std = Sqrt(var+epsilon), init shape = [ N, G ] - phi::DenseTensor std(_type); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - // d_xnorm_std = dy_proc * scale / std - phi::DenseTensor d_xnorm_std(_type); - d_xnorm_std.mutable_data(y->dims(), place); - F.Mul(d_y, &scale_share, &d_xnorm_std); - if (data_layout == DataLayout::kNCHW) { - xnorm.Resize({N, G, C * H * W / G}); - d_xnorm_std.Resize({N, G, C * H * W / G}); - std.Resize({N, G, 1}); - } else { - xnorm.Resize({N, C * H * W / G, G}); - d_xnorm_std.Resize({N, C * H * W / G, G}); - std.Resize({N, 1, G}); - } - F.Div(&d_xnorm_std, &std, &d_xnorm_std); - - // d_x = d_xnorm_std - // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm - // - Mean ( d_xnorm_std, axis=1, keepdim=True ) - d_x->mutable_data(place); - d_x->Resize(xnorm.dims()); - F.Mul(&d_xnorm_std, &xnorm, d_x); - phi::DenseTensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); - F.Mul(&dx1, &xnorm, d_x); - - phi::DenseTensor dx2 = - F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); - - F.Sub(&d_xnorm_std, d_x, d_x); - F.Sub(d_x, &dx2, d_x); - - d_x->Resize(y->dims()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(group_norm, - ops::GroupNormNPUKernel, - ops::GroupNormNPUKernel); -REGISTER_OP_NPU_KERNEL(group_norm_grad, - ops::GroupNormGradNPUKernel, - ops::GroupNormGradNPUKernel); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc deleted file mode 100644 index 4812dfa47dfed..0000000000000 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
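The deleted GroupNorm kernels above reshape the input to [N*G, C*H*W/G], compute mean and variance per group, normalize by sqrt(var + epsilon), and then apply per-channel scale and bias. A minimal single-sample NCHW model of the forward math (plain standard C++, illustrative names):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // One sample (N = 1), NCHW: per group g, y = (x - mean_g)/sqrt(var_g + eps),
    // then y = y * scale[c] + bias[c] per channel c.
    std::vector<float> GroupNormForward(const std::vector<float>& x, int C,
                                        int HW, int G, float eps,
                                        const std::vector<float>& scale,
                                        const std::vector<float>& bias) {
      std::vector<float> y(x.size());
      const int group_size = (C / G) * HW;  // elements per group
      for (int g = 0; g < G; ++g) {
        const int begin = g * group_size;
        float mean = 0.0f, var = 0.0f;
        for (int i = 0; i < group_size; ++i) mean += x[begin + i];
        mean /= group_size;
        for (int i = 0; i < group_size; ++i) {
          const float d = x[begin + i] - mean;
          var += d * d;
        }
        var /= group_size;
        const float inv_std = 1.0f / std::sqrt(var + eps);
        for (int i = 0; i < group_size; ++i) {
          const int c = (begin + i) / HW;  // channel of this element
          y[begin + i] = (x[begin + i] - mean) * inv_std * scale[c] + bias[c];
        }
      }
      return y;
    }

    int main() {
      // Two channels, two groups: each constant channel normalizes to its bias.
      auto y = GroupNormForward({1, 1, 5, 5}, /*C=*/2, /*HW=*/2, /*G=*/2,
                                1e-5f, {1.0f, 1.0f}, {0.0f, 0.0f});
      assert(std::fabs(y[0]) < 1e-3f && std::fabs(y[2]) < 1e-3f);
    }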
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void HuberLossSub(const platform::Place& place,
-                  const aclrtStream& stream,
-                  const phi::DenseTensor* x,
-                  const phi::DenseTensor* y,
-                  phi::DenseTensor* z) {
-  // Calculate z = x - y
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossMuls(const platform::Place& place,
-                   const aclrtStream& stream,
-                   const phi::DenseTensor* x,
-                   float scalar,
-                   phi::DenseTensor* y) {
-  // Calculate y = x * scalar
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossZerosLike(const platform::Place& place,
-                        const aclrtStream& stream,
-                        const phi::DenseTensor* x,
-                        phi::DenseTensor* y) {
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*y}, {});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossSmoothL1Loss(const platform::Place& place,
-                           const aclrtStream& stream,
-                           const phi::DenseTensor* x,
-                           const phi::DenseTensor* y,
-                           float delta,
-                           phi::DenseTensor* z) {
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner =
-      NpuOpRunner("SmoothL1Loss", {*x, *y}, {*z}, {{"sigma", delta}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void HuberLossSmoothL1LossGrad(const platform::Place& place,
-                               const aclrtStream& stream,
-                               const phi::DenseTensor* pred,
-                               const phi::DenseTensor* lab,
-                               const phi::DenseTensor* dout,
-                               float sigma,
-                               phi::DenseTensor* grad) {
-  grad->mutable_data<T>(pred->dims(), place);
-  const auto& runner = NpuOpRunner(
-      "SmoothL1LossGrad", {*pred, *lab, *dout}, {*grad}, {{"sigma", sigma}});
-  runner.Run(stream);
-}
-
-template <typename T>
-class HuberLossNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in0 = ctx.Input<phi::DenseTensor>("X");
-    auto* in1 = ctx.Input<phi::DenseTensor>("Y");
-    auto* residual = ctx.Output<phi::DenseTensor>("Residual");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    auto delta = ctx.Attr<float>("delta");
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    auto place = ctx.GetPlace();
-    HuberLossSub<T>(place, stream, in1, in0, residual);
-
-    HuberLossSmoothL1Loss<T>(place, stream, in0, in1, delta, out);
-    HuberLossMuls<T>(place, stream, out, delta, out);
-  }
-};
-
-template <typename T>
-class HuberLossGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* residual = ctx.Input<phi::DenseTensor>("Residual");
-    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
-    auto delta = ctx.Attr<float>("delta");
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    auto place = ctx.GetPlace();
-
-    phi::DenseTensor t_grad_rd;
-    if (dx || dy) {
-      phi::DenseTensor t_zero;
-      HuberLossZerosLike<T>(place, stream, residual, &t_zero);
-      HuberLossSmoothL1LossGrad<T>(
-          place, stream, residual, &t_zero, dout, delta, &t_grad_rd);
-    }
-    if (dx) {
-      HuberLossMuls<T>(place, stream, &t_grad_rd, -delta, dx);
-    }
-    if (dy) {
-      HuberLossMuls<T>(place, stream, &t_grad_rd, delta, dy);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(huber_loss,
-                       ops::HuberLossNPUKernel<float>,
-                       ops::HuberLossNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(huber_loss_grad,
-                       ops::HuberLossGradNPUKernel<float>,
ops::HuberLossGradNPUKernel); diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc deleted file mode 100644 index 7188fe38fdc68..0000000000000 --- a/paddle/fluid/operators/increment_op_npu.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class IncrementalNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); - float step = context.Attr("step"); - out_tensor->mutable_data(context.GetPlace()); - - Tensor step_tensor(x_tensor->dtype()); - - step_tensor.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&step_tensor, static_cast(step)); - - const auto& runner = - NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - increment, - paddle::operators::IncrementalNPUKernel, - paddle::operators::IncrementalNPUKernel, - paddle::operators::IncrementalNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::IncrementalNPUKernel, -#endif - paddle::operators::IncrementalNPUKernel) diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc deleted file mode 100644 index 2a77ff82d0fa3..0000000000000 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(increment); -USE_OP_DEVICE_KERNEL(increment, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - init.push_back(static_cast(1.0)); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attr_input = {{"step", static_cast(2.0)}}; - auto op = f::OpRegistry::CreateOp( - "increment", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attr_input); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1); - EXPECT_EQ(out_vec[0], static_cast(3.0)); -} - -TEST(increment, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} - -TEST(increment, NPU_fp64) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc deleted file mode 100644 index 64a50041421b3..0000000000000 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* input, - phi::DenseTensor* out) { - auto index_dims = index->dims(); - auto input_dims = input->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector gather_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - gather_index_vec.push_back(i); - gather_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor gather_index; - framework::TensorFromVector(gather_index_vec, dev_ctx, &gather_index); - gather_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("GatherNd") - .AddInput(*input) - .AddInput(gather_index) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGather(dev_ctx, index, input, out); - } else { - IndexSampleGather(dev_ctx, index, input, out); - } - } -}; - -template -void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* out_grad, - phi::DenseTensor* x_grad) { - auto index_dims = index->dims(); - auto input_dims = x_grad->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector scatter_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - scatter_index_vec.push_back(i); - scatter_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor scatter_index; - framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); - scatter_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("ScatterNd") - .AddInput(scatter_index) - .AddInput(*out_grad) - .AddInput(phi::vectorize(x_grad->dims())) - .AddOutput(*x_grad); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - x_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } else { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(index_sample, - ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel, 
- ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel); -REGISTER_OP_NPU_KERNEL(index_sample_grad, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel); diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc deleted file mode 100644 index dd9c5608a0469..0000000000000 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class IndexSelectNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto dim = ctx.Attr("dim"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*x) - .AddInput(*index) - .AddInput(std::vector{dim}) - .AddOutput(*out); - runner.Run(stream); - } -}; - -template -class IndexSelectGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - - auto stream = - ctx.template device_context() - .stream(); - - auto x_dims = x_grad->dims(); - auto out_dims = out_grad->dims(); - - int dim = ctx.Attr("dim"); - if (dim < 0) { - dim += out_dims.size(); - } - - phi::DenseTensor casted_index; - if (framework::TransToProtoVarType(index->dtype()) != - framework::proto::VarType::INT32) { - casted_index.mutable_data(index->dims(), ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*index}, {casted_index}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_index.ShareDataWith(*index); - } - - if (dim == 0) { - x_grad->mutable_data(ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {*x_grad}, {*x_grad}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(*x_grad); - runner.Run(stream); - } else { - phi::DenseTensor transed_out_grad; - std::vector in_trans_perm; - in_trans_perm.push_back(dim); - for (int i = 0; i < out_dims.size(); ++i) { - if (i == dim) continue; - in_trans_perm.push_back(i); - } - framework::DDim transed_out_dims(out_dims); - for (size_t i = 0; i < in_trans_perm.size(); ++i) { - transed_out_dims[i] = out_dims[in_trans_perm[i]]; - } - transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - NpuOpRunner in_trans_runner; - in_trans_runner.SetType("Transpose") 
- .AddInput(*out_grad) - .AddInput(std::move(in_trans_perm)) - .AddOutput(transed_out_grad); - in_trans_runner.Run(stream); - - phi::DenseTensor sum_out; - framework::DDim sum_dims(x_dims); - sum_dims[0] = x_dims[dim]; - auto idx = 1; - for (int i = 0; i < x_dims.size(); ++i) { - if (i == dim) continue; - sum_dims[idx++] = x_dims[i]; - } - sum_out.mutable_data(sum_dims, ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {sum_out}, {sum_out}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(transed_out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(sum_out); - runner.Run(stream); - - std::vector out_trans_perm; - for (int i = 1; i < 1 + dim; ++i) { - out_trans_perm.push_back(i); - } - out_trans_perm.push_back(0); - for (int i = 1 + dim; i < x_dims.size(); ++i) { - out_trans_perm.push_back(i); - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner out_trans_runner; - out_trans_runner.SetType("Transpose") - .AddInput(sum_out) - .AddInput(std::move(out_trans_perm)) - .AddOutput(*x_grad); - out_trans_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - index_select, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel); -REGISTER_OP_NPU_KERNEL( - index_select_grad, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel); diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc deleted file mode 100644 index 03307895f09e2..0000000000000 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class InstanceNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("SavedMean"); - auto* variance = ctx.Output("SavedVariance"); - auto& dev_ctx = ctx.template device_context(); - - dev_ctx.template Alloc(y); - dev_ctx.template Alloc(mean); - dev_ctx.template Alloc(variance); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - PADDLE_ENFORCE(x_dims.size() <= 5 && x_dims.size() >= 3, - platform::errors::InvalidArgument( - "InstanceNorm only supports the dimension of input " - " less equal to 5 and greater equal to 3. 
the dimension " - "of input is %d.", - x_dims.size())); - - auto tmp_x_dims = phi::vectorize(x_dims); - auto tmp_y_dims = phi::vectorize(y_dims); - if (x_dims.size() < 5) { - for (size_t i = x_dims.size(); i < 5; ++i) { - tmp_x_dims.insert(tmp_x_dims.begin() + 2, 1); - tmp_y_dims.insert(tmp_y_dims.begin() + 2, 1); - } - } - - phi::DenseTensor tmp_x, tmp_y; - tmp_x.ShareDataWith(*x); - - tmp_x.Resize(phi::make_ddim(tmp_x_dims)); - tmp_x.set_layout(phi::DataLayout::NCDHW); - tmp_y.ShareDataWith(*y); - tmp_y.Resize(phi::make_ddim(tmp_y_dims)); - tmp_y.set_layout(phi::DataLayout::NCDHW); - - NpuOpRunner runner; - - runner.SetType("InstanceNorm") - .AddInput(tmp_x) - .AddInput(*scale) - .AddInput(*bias) - .AddAttr("data_format", std::string("NCDHW")) - .AddAttr("epsilon", epsilon) - .AddOutput(tmp_y) - .AddOutput(*mean) - .AddOutput(*variance); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - instance_norm, - ops::InstanceNormNPUKernel, - ops::InstanceNormNPUKernel); diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc deleted file mode 100644 index 108efafff683f..0000000000000 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/operators/interpolate_op.h"
-
-namespace paddle {
-namespace operators {
-using DataLayout = phi::DataLayout;
-
-inline static void CheckArgument(const framework::ExecutionContext& ctx) {
-  const std::string interp_method = ctx.Attr<std::string>("interp_method");
-#if (CANN_VERSION_CODE < 512000)
-  bool align_corners = ctx.Attr<bool>("align_corners");
-  PADDLE_ENFORCE_EQ(
-      align_corners,
-      false,
-      platform::errors::InvalidArgument(
-          "NPU Interpolate Kernel gives different results when "
-          "align_corners is true."));
-#endif
-  PADDLE_ENFORCE_EQ(
-      interp_method,
-      "nearest",
-      platform::errors::InvalidArgument(
-          "NPU Interpolate Kernel only supports nearest interpolation."));
-}
-
-inline static void ExtractNCHW(const framework::DDim& dims,
-                               const DataLayout& data_layout,
-                               int32_t* n,
-                               int32_t* c,
-                               int32_t* h,
-                               int32_t* w) {
-  *n = dims[0];
-  if (data_layout == DataLayout::kNCHW) {
-    *c = dims[1];
-    *h = dims[2];
-    *w = dims[3];
-  } else {  // kNHWC
-    *h = dims[1];
-    *w = dims[2];
-    *c = dims[3];
-  }
-}
-
-static void CalcOutSize(const framework::ExecutionContext& ctx,
-                        int32_t in_h,
-                        int32_t in_w,
-                        int32_t* out_h,
-                        int32_t* out_w) {
-  // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w
-  *out_h = ctx.Attr<int>("out_h");
-  *out_w = ctx.Attr<int>("out_w");
-
-  auto dev_ctx = platform::DeviceContextPool::Instance().Get(ctx.GetPlace());
-  auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
-
-  if (list_new_size_tensor.size() > 0) {
-    std::vector<int32_t> new_size_h(1);
-    std::vector<int32_t> new_size_w(1);
-    framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &new_size_h);
-    framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &new_size_w);
-    *out_h = new_size_h[0];
-    *out_w = new_size_w[0];
-  } else {
-    float scale;
-    auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
-    if (scale_tensor != nullptr) {
-      std::vector<float> scale_data;
-      framework::TensorToVector(*scale_tensor, *dev_ctx, &scale_data);
-      scale = scale_data[0];
-    } else {
-      scale = ctx.Attr<float>("scale");
-    }
-
-    if (scale > 0) {
-      *out_h = static_cast<int32_t>(in_h * scale);
-      *out_w = static_cast<int32_t>(in_w * scale);
-    }
-
-    auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
-    if (out_size != nullptr) {
-      std::vector<int32_t> out_size_data;
-      framework::TensorToVector(*out_size, *dev_ctx, &out_size_data);
-      *out_h = out_size_data[0];
-      *out_w = out_size_data[1];
-    }
-  }
-
-  PADDLE_ENFORCE_GT(*out_h,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "out_h in Attr(out_shape) of Op(interpolate) "
-                        "should be greater than 0."));
-  PADDLE_ENFORCE_GT(*out_w,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "out_w in Attr(out_shape) of Op(interpolate) "
-                        "should be greater than 0."));
-}
-
-template <typename T>
-class InterpolateNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // NOTE(Ruibiao):
-    // this kernel only supports nearest interpolation for 2D images;
-    // the Ascend 'ResizeNearestNeighborV2' used in this kernel gives
-    // different results when 'align_corners' is 'true' or the data type
-    // is 'double'
-    CheckArgument(ctx);
-
-    auto* input = ctx.Input<phi::DenseTensor>("X");
-    framework::DDim input_dims = input->dims();
-
-    const std::string data_layout_str =
-        ctx.Attr<std::string>("data_layout");  // kNCHW or kNHWC
-    const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
-
-    int32_t n, c, h, w, out_h, out_w;
-    ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w);
-    CalcOutSize(ctx, h, w, &out_h, &out_w);
-
-    // the 'input' tensor may have an unset (or wrongly set) layout
-    phi::DenseTensor input_x(input->type());
input_x.ShareDataWith(*input); - input_x.set_layout(data_layout); - - auto* output = ctx.Output("Out"); - framework::DDim output_dims; - if (data_layout == DataLayout::kNCHW) { - output_dims = {n, c, out_h, out_w}; - } else { - output_dims = {n, out_h, out_w, c}; - } - output->set_layout(data_layout); - output->mutable_data(output_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2") - .AddInput(input_x) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -template -class InterpolateGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // NOTE(Ruibiao): - // this kernel only support nearest interpolotion for 2D images - // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff - // when 'align_corners' is 'true' or data type is 'double' - CheckArgument(ctx); - - auto* input = ctx.Input("X"); - framework::DDim input_dims = input->dims(); - - const std::string data_layout_str = - ctx.Attr("data_layout"); // kNCHW or kNHWC - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - int32_t n, c, h, w, out_h, out_w; - ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); - CalcOutSize(ctx, h, w, &out_h, &out_w); - - // the 'output_grad' tensor may has no set (or wrong set) of the layout - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - phi::DenseTensor output_grad_tmp(output_grad->type()); - output_grad_tmp.ShareDataWith(*output_grad); - output_grad_tmp.set_layout(data_layout); - - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - input_grad->set_layout(data_layout); - framework::DDim input_grad_dims; - if (data_layout == DataLayout::kNCHW) { - input_grad_dims = {n, c, h, w}; - } else { - input_grad_dims = {n, h, w, c}; - } - input_grad->mutable_data(input_grad_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(output_grad_tmp) - .AddInput(std::vector{h, w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(nearest_interp, - ops::InterpolateNPUKernel, - ops::InterpolateNPUKernel); -REGISTER_OP_NPU_KERNEL(nearest_interp_grad, - ops::InterpolateGradNPUKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc deleted file mode 100644 index d16494f229e42..0000000000000 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ /dev/null @@ -1,812 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/interpolate_function.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; -using DDim = framework::DDim; -using fp16 = paddle::platform::float16; - -template -struct InterpolateFunction { - public: - explicit InterpolateFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->dtype())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Gather(const phi::DenseTensor* x, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}}); - runner.Run(stream); - } - void GatherGrad(const phi::DenseTensor* gy, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* gx) { - // 1 gy swapaxis: axis & 0 - int len = (gy->dims()).size(); - std::vector axis_swap(len); - for (int i = 0; i < len; i++) { - axis_swap[i] = i; - } - axis_swap[0] = axis; - axis_swap[axis] = 0; - auto y_new_shape = gy->dims(); - auto yt = y_new_shape[axis]; - y_new_shape[axis] = y_new_shape[0]; - y_new_shape[0] = yt; - phi::DenseTensor gy_t; - gy_t.mutable_data(y_new_shape, place); - Transpose(gy, &gy_t, axis_swap); - // 2 scatter - auto x_new_shape = gx->dims(); - auto xt = x_new_shape[axis]; - x_new_shape[axis] = x_new_shape[0]; - x_new_shape[0] = xt; - phi::DenseTensor gx_zero, gx_t; - gx_zero.mutable_data(x_new_shape, place); - gx_t.mutable_data(x_new_shape, place); - FillNpuTensorWithConstant(&gx_zero, static_cast(0)); - gx_zero.Resize(x_new_shape); - Scatter(&gx_zero, indices, &gy_t, &gx_t); - // 3 gx swapaxis: axis, 0 - Transpose(&gx_t, gx, axis_swap); - } - void Scatter(const phi::DenseTensor* x, - const phi::DenseTensor* index, - const phi::DenseTensor* updates, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("TensorScatterAdd", 
{*x, *index, *updates}, {*y}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Floor(const phi::DenseTensor* x, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -void InterpolateParamCompute(const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout, - const DDim& indim, - const DDim& outdim, - int* axis_h, - int* axis_w, - int* in_h, - int* in_w, - int* out_h, - int* out_w, - float* ratio_h, - float* ratio_w) { - if (data_layout == DataLayout::kNCHW) { - *axis_h = 2; - *axis_w = 3; - } else { - *axis_h = 1; - *axis_w = 2; - } - *out_h = outdim[*axis_h]; - *out_w = outdim[*axis_w]; - *in_h = indim[*axis_h]; - *in_w = indim[*axis_w]; - *ratio_h = 0.0f; - *ratio_w = 0.0f; - if (*out_h > 1) { - *ratio_h = - align_corners - ? static_cast(*in_h - 1) / (*out_h - 1) - : (scale_h > 0 ? 1 / scale_h : static_cast(*in_h) / *out_h); - } - if (*out_w > 1) { - *ratio_w = - align_corners - ? static_cast(*in_w - 1) / (*out_w - 1) - : (scale_w > 0 ? 
1 / scale_w : static_cast(*in_w) / *out_w); - } -} - -template -void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, - const DataLayout& data_layout, - int in_h, - int in_w, - int out_h, - int out_w, - bool align_cond, - float ratio_h, - float ratio_w, - phi::DenseTensor* h0, - phi::DenseTensor* h1, - phi::DenseTensor* w0, - phi::DenseTensor* w1, - phi::DenseTensor* coef_h0, - phi::DenseTensor* coef_h1, - phi::DenseTensor* coef_w0, - phi::DenseTensor* coef_w1) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - phi::DenseTensor _h0, _w0; - _h0.mutable_data({out_h}, place); - _w0.mutable_data({out_w}, place); - F.Arange(out_h, &_h0); - F.Arange(out_w, &_w0); - if (align_cond) { - F.Adds(&_h0, static_cast(0.5), &_h0); - F.Adds(&_w0, static_cast(0.5), &_w0); - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - F.Adds(&_h0, static_cast(-0.5), &_h0); - F.Adds(&_w0, static_cast(-0.5), &_w0); - } else { - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - } - - phi::DenseTensor zero_t; - phi::DenseTensor one_t; - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - F.Maximum(&_h0, &zero_t, &_h0); - F.Maximum(&_w0, &zero_t, &_w0); - - phi::DenseTensor _h0_floor, _w0_floor; - _h0_floor.mutable_data({out_h}, place); - _w0_floor.mutable_data({out_w}, place); - F.Floor(&_h0, &_h0_floor); - F.Floor(&_w0, &_w0_floor); - F.Cast(&_h0_floor, h0); - F.Cast(&_w0_floor, w0); - - phi::DenseTensor one_int; - one_int.mutable_data({1}, place); - FillNpuTensorWithConstant(&one_int, static_cast(1)); - F.Add(h0, &one_int, h1); - F.Add(w0, &one_int, w1); - phi::DenseTensor t_max_h, t_max_w; - t_max_h.mutable_data({1}, place); - t_max_w.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_max_h, static_cast(in_h - 1)); - FillNpuTensorWithConstant(&t_max_w, static_cast(in_w - 1)); - F.Minimum(h1, &t_max_h, h1); - F.Minimum(w1, &t_max_w, w1); - - F.Sub(&_h0, &_h0_floor, coef_h1); - F.Sub(&_w0, &_w0_floor, coef_w1); - F.Sub(&one_t, coef_h1, coef_h0); - F.Sub(&one_t, coef_w1, coef_w0); - - if (data_layout == DataLayout::kNCHW) { - coef_h0->Resize({out_h, 1}); - coef_h1->Resize({out_h, 1}); - } else { - coef_h0->Resize({out_h, 1, 1}); - coef_h1->Resize({out_h, 1, 1}); - coef_w0->Resize({out_w, 1}); - coef_w1->Resize({out_w, 1}); - } -} - -template -void BilinearFwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* input, - phi::DenseTensor* output, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = output->dims(); - auto indim = input->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - 
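A reading aid for the tensors prepared by BilinearParamTensorCompute in the call that follows; this is the standard bilinear sampling scheme the code implements, in my own notation rather than anything from the file. For an output row index \(r\) (columns are analogous):

\[
h = \begin{cases}
(r + 0.5)\,\mathrm{ratio}_h - 0.5, & \text{align\_cond (half-pixel)} \\
r\,\mathrm{ratio}_h, & \text{otherwise}
\end{cases}
\]
\[
h_0 = \lfloor \max(h, 0) \rfloor, \quad h_1 = \min(h_0 + 1, H_{\mathrm{in}} - 1), \quad
c_{h_1} = h - \lfloor h \rfloor, \quad c_{h_0} = 1 - c_{h_1},
\]

and the four Gather/Mul passes below blend \(x[h_i, w_j]\) with weights \(c_{h_i} c_{w_j}\) before the final ReduceSum.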
BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor input_gather_h0, input_gather_h1; - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - input_gather_h0.mutable_data(dim_gather_h, place); - input_gather_h1.mutable_data(dim_gather_h, place); - - F.Gather(input, &h0, axis_h, &input_gather_h0); - F.Gather(input, &h1, axis_h, &input_gather_h1); - - F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0); - F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1); - phi::DenseTensor out_x4; - out_x4.mutable_data({4, outdim[0], outdim[1], outdim[2], outdim[3]}, - place); - phi::DenseTensor input_gather_h0_w0 = out_x4.Slice(0, 1); - phi::DenseTensor input_gather_h0_w1 = out_x4.Slice(1, 2); - phi::DenseTensor input_gather_h1_w0 = out_x4.Slice(2, 3); - phi::DenseTensor input_gather_h1_w1 = out_x4.Slice(3, 4); - F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0); - F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1); - F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0); - F.Gather(&input_gather_h1, &w1, axis_w, &input_gather_h1_w1); - F.Mul(&input_gather_h0_w0, &coef_w0, &input_gather_h0_w0); - F.Mul(&input_gather_h0_w1, &coef_w1, &input_gather_h0_w1); - F.Mul(&input_gather_h1_w0, &coef_w0, &input_gather_h1_w0); - F.Mul(&input_gather_h1_w1, &coef_w1, &input_gather_h1_w1); - F.ReduceSum(&out_x4, output, std::vector{0}, false); -} - -template -void BilinearBwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* gout, - phi::DenseTensor* gin, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = gout->dims(); - auto indim = gin->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor gy_w0, gy_w1; - gy_w0.mutable_data(outdim, place); - gy_w1.mutable_data(outdim, place); - F.Mul(gout, &coef_w0, &gy_w0); - F.Mul(gout, &coef_w1, &gy_w1); - - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - phi::DenseTensor g_gather_w0, g_gather_w1; - g_gather_w0.mutable_data(dim_gather_h, place); - g_gather_w1.mutable_data(dim_gather_h, place); - w0.Resize({out_w, 1}); - w1.Resize({out_w, 1}); - F.GatherGrad(&gy_w0, &w0, axis_w, &g_gather_w0); - F.GatherGrad(&gy_w1, &w1, axis_w, &g_gather_w1); - - F.Add(&g_gather_w0, &g_gather_w1, &g_gather_w0); - F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1); - F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0); - - phi::DenseTensor gx_0, gx_1; - 
gx_0.mutable_data(indim, place); - gx_1.mutable_data(indim, place); - h0.Resize({out_h, 1}); - h1.Resize({out_h, 1}); - F.GatherGrad(&g_gather_w0, &h0, axis_h, &gx_0); - F.GatherGrad(&g_gather_w1, &h1, axis_h, &gx_1); - - F.Add(&gx_0, &gx_1, gin); -} - -template -class InterpolateV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ( - input_dims.size(), - 4UL, - platform::errors::External( - "NPU Interpolate Kernel only support 4-D phi::DenseTensor.")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_shape_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_shape_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_shape_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) 
{ - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - PADDLE_ENFORCE_GT(out_h, - 0, - platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, - 0, - platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*input, ctx.GetPlace(), output); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeD - // Add bilineare by zhulei - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2") - .AddInput(*input) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearFwdNpu(ctx, - input, - output, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -template -class InterpolateV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_w = scale_data[0]; - scale_h = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' 
phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeGradD - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(*output_grad) - .AddInput(std::vector{in_h, in_w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearBwdNpu(ctx, - output_grad, - input_grad, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc deleted file mode 100644 index 91a0698d626f5..0000000000000 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/operators/is_empty_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    is_empty,
-    ops::IsEmptyOpKernel<plat::NPUDeviceContext, float>,
-    ops::IsEmptyOpKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc
deleted file mode 100644
index d2b4626c58cb4..0000000000000
--- a/paddle/fluid/operators/kldiv_loss_op_npu.cc
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class KLDivLossNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<phi::DenseTensor>("X");
-    auto* target = ctx.Input<phi::DenseTensor>("Target");
-    auto* loss = ctx.Output<phi::DenseTensor>("Loss");
-    auto reduction = ctx.Attr<std::string>("reduction");
-    loss->mutable_data<T>(ctx.GetPlace());
-
-    auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
-    auto stream = dev_ctx.stream();
-
-    if ("none" == reduction) {
-      // log(label)
-      auto ones_tensor = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& ones_runner =
-          NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {});
-      ones_runner.Run(stream);
-
-      auto sub_tensor = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& sub_runner =
-          NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {});
-      sub_runner.Run(stream);
-
-      auto log_target = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& log_runner =
-          NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {});
-      log_runner.Run(stream);
-
-      // log(label) - input
-      const auto& sub_runner2 =
-          NpuOpRunner("Sub", {log_target, *input}, {*loss}, {});
-      sub_runner2.Run(stream);
-
-      // label * (log(label) - input)
-      auto min_value =
-          ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>({1}, dev_ctx);
-      auto max_value =
-          ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>({1}, dev_ctx);
-      FillNpuTensorWithConstant<T>(&min_value, static_cast<T>(0));
-      FillNpuTensorWithConstant<T>(&max_value, std::numeric_limits<T>::max());
-
-      auto cliped_target = ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
-          target->dims(), dev_ctx);
-      const auto& clip_runner = NpuOpRunner(
-          "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {});
-      clip_runner.Run(stream);
-
-      const auto& mul_runner =
-          NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {});
-      mul_runner.Run(stream);
-    } else if ("batchmean" == reduction || "sum" == reduction) {
-      const auto& runner = NpuOpRunner(
-          "KLDiv", {*input, *target}, {*loss}, {{"reduction", reduction}});
-      runner.Run(stream);
-    } else if ("mean" == reduction) {
-      const auto& runner = NpuOpRunner("KLDiv",
-                                       {*input, *target},
-                                       {*loss},
-                                       {{"reduction", std::string("sum")}});
-      runner.Run(stream);
-
-      const int numel = input->numel();
-      const auto& muls_runner =
-          NpuOpRunner("Muls",
-                      {*loss},
-                      {*loss},
-                      {{"value", static_cast<float>(1.0 / numel)}});
-      muls_runner.Run(stream);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class KLDivLossGradNPUKernel : public
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* target = ctx.Input("Target"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto reduction = ctx.Attr("reduction"); - input_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - phi::DenseTensor loss_grad_transformed; - if ("none" == reduction) { - loss_grad_transformed.ShareDataWith(*loss_grad); - } else { - loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); - - NpuOpRunner broadcast_runner; - broadcast_runner.SetType("BroadcastTo"); - broadcast_runner.AddInput(*loss_grad); - broadcast_runner.AddInput(phi::vectorize(input_grad->dims())); - broadcast_runner.AddOutput(loss_grad_transformed); - broadcast_runner.Run(stream); - } - auto min_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto max_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - FillNpuTensorWithConstant(&min_value, static_cast(0)); - FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); - - auto cliped_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& clip_runner = NpuOpRunner( - "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); - clip_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner( - "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); - mul_runner.Run(stream); - - float k = -1.0f; - - if ("mean" == reduction) { - k = static_cast(-1.0 / input_grad->numel()); - } else if ("batchmean" == reduction) { - k = static_cast(-1.0 / input_grad->dims()[0]); - } - - const auto& muls_runner = - NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); - muls_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(kldiv_loss, - ops::KLDivLossNPUKernel, - ops::KLDivLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, - ops::KLDivLossGradNPUKernel, - ops::KLDivLossGradNPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc deleted file mode 100644 index 5c267625f55f7..0000000000000 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
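The label_smooth kernels in the file below combine Muls/Adds (and a broadcast AddV2 when PriorDist is given) into the usual smoothing rule: out = (1 - eps) * x + eps * p with a prior p, or eps / K without one, where K is the size of the last axis; the backward is simply dx = (1 - eps) * dout. As a reference-only sketch of the same computation on a plain array (editorial addition; the names here are illustrative, not Paddle APIs):

#include <cstddef>
#include <vector>

// Label smoothing over one row of K class probabilities.
// prior may be nullptr, in which case the uniform 1/K prior is used.
std::vector<float> LabelSmooth(const std::vector<float>& x, float epsilon,
                               const std::vector<float>* prior) {
  const float k = static_cast<float>(x.size());
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    // out = (1 - eps) * x + eps * prior   (PriorDist given)
    // out = (1 - eps) * x + eps / K       (no PriorDist)
    out[i] = (1.0f - epsilon) * x[i] +
             (prior ? epsilon * (*prior)[i] : epsilon / k);
  }
  return out;
}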
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LabelSmoothMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAddBroadCast(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in1, - const phi::DenseTensor* in2, - phi::DenseTensor* out) { - out->mutable_data(place); - const auto& runner = NpuOpRunner("AddV2", {*in1, *in2}, {*out}, {}); - runner.Run(stream); -} - -template -class LabelSmoothNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto epsilon = ctx.Attr("epsilon"); - - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - if (dist_t) { - phi::DenseTensor tmp; - phi::DenseTensor dist; - phi::DenseTensor tmp2; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothMuls(place, stream, dist_t, epsilon, &tmp2); - tmp2.Resize({1, label_dim}); - LabelSmoothAddBroadCast(place, stream, &tmp, &tmp2, out_t); - } else { - phi::DenseTensor tmp; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothAdds(place, stream, &tmp, (epsilon / label_dim), out_t); - } - } -}; - -template -class LabelSmoothGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - auto epsilon = ctx.Attr("epsilon"); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - LabelSmoothMuls(place, stream, d_out_t, 1 - epsilon, d_in_t); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(label_smooth, - ops::LabelSmoothNPUKernel, - ops::LabelSmoothNPUKernel); -REGISTER_OP_NPU_KERNEL(label_smooth_grad, - ops::LabelSmoothGradNPUKernel, - ops::LabelSmoothGradNPUKernel); diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc deleted file mode 100644 index ca6762f2e325a..0000000000000 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ /dev/null @@ -1,449 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
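
The label_smooth kernels deleted above amount to a convex blend of the input distribution with a prior; a scalar sketch under that reading (hypothetical helper, not part of the deleted file):

#include <vector>

// out = (1 - epsilon) * x + epsilon * prior; the prior defaults to the
// uniform value 1 / num_classes when no PriorDist input is given, and the
// gradient is simply d_x = (1 - epsilon) * d_out.
void LabelSmoothRef(const std::vector<float>& x, float epsilon,
                    int num_classes, std::vector<float>* out) {
  out->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = (1.0f - epsilon) * x[i] + epsilon / num_classes;
  }
}
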
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -using DataLayout = phi::DataLayout; - -template -class NormDataType; - -template <> -class NormDataType { - public: - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template <> -class NormDataType { - public: - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template -using NormDataType = NormDataType; -template -using LayerNormParamType = typename NormDataType::BatchNormParamType; - -template -class LayerNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); - const auto& x_dims = x->dims(); - std::vector axes; - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - // The shape of scale and bias should be equal to x.shape[begin_norm_axis:], - // required by Ascend. - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - phi::DenseTensor default_bias(x->type()); - if (!bias) { - default_bias.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); - runner.Run(stream); - bias = &default_bias; - } else { - const_cast(bias)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast bias from LayerNormParamType to T if needed - phi::DenseTensor cast_bias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32) { - 
cast_bias.Resize(bias->dims()); - cast_bias.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_bias = - NpuOpRunner("Cast", - {*bias}, - {cast_bias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_bias.Run(stream); - } else { - cast_bias.ShareDataWith(*bias); - } - - y->mutable_data(ctx.GetPlace()); - - // mean should be of U type - phi::DenseTensor* tmp_mean = mean; - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - tmp_mean = &cast_mean; - mean->mutable_data(ctx.GetPlace()); - } else { - mean->mutable_data(ctx.GetPlace()); - } - - // same for variance - phi::DenseTensor* tmp_variance = variance; - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - tmp_variance = &cast_variance; - variance->mutable_data(ctx.GetPlace()); - } else { - variance->mutable_data(ctx.GetPlace()); - } - - const auto& runner = NpuOpRunner("LayerNorm", - {*x, cast_scale, cast_bias}, - {*y, *tmp_mean, *tmp_variance}, - {{"begin_norm_axis", begin_norm_axis}, - {"begin_params_axis", begin_norm_axis}, - {"epsilon", epsilon}}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(mean->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*tmp_mean}, - {*mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } - // same for variance - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(variance->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*tmp_variance}, - {*variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } - - // revert shape of scale and bias - // TODO(zhiqiu): better implementation, use tmp tensor to avoid write input - // tensor. 
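
// In effect, the LayerNorm runner above normalizes each row of the
// [left, right] view selected by begin_norm_axis:
//   y = (x - mean(x)) / sqrt(var(x) + epsilon) * scale + bias,
// emitting per-row mean and variance; the surrounding FP16/FP32 casts
// exist because the Ascend op apparently requires scale, bias and the
// output statistics to share the input dtype.
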
- const_cast(scale)->Resize(phi::make_ddim({right})); - const_cast(bias)->Resize(phi::make_ddim({right})); - } -}; - -template -class LayerNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto* x = ctx.Input("X"); - const auto& x_dims = x->dims(); - const auto* mean = ctx.Input("Mean"); - const auto* variance = ctx.Input("Variance"); - const auto* scale = ctx.Input("Scale"); - const auto* dy = ctx.Input(framework::GradVarName("Y")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dscale = - ctx.Output(framework::GradVarName("Scale")); - auto* dbias = ctx.Output(framework::GradVarName("Bias")); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - std::vector axes; - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // No need to compute any gradient, just return - if (!dx && !dscale && !dbias) { - return; - } - - // The rank of mean should be equal to that of x, as required by Ascend. - std::vector new_shape; - for (auto i = 0; i < begin_norm_axis; ++i) { - new_shape.push_back(x_dims[i]); - } - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - new_shape.push_back(1); - } - - auto mean_dims = mean->dims(); - const_cast(mean)->Resize(phi::make_ddim({new_shape})); - const_cast(variance)->Resize( - phi::make_ddim({new_shape})); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast mean from LayerNormParamType to T if needed - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*mean}, - {cast_mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } else { - cast_mean.ShareDataWith(*mean); - } - - // cast variance from LayerNormParamType to T if needed - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == -
framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*variance}, - {cast_variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } else { - cast_variance.ShareDataWith(*variance); - } - - phi::DenseTensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); - dx = (dx == nullptr) ? &dx_ : dx; - dscale = (dscale == nullptr) ? &dscale_ : dscale; - dbias = (dbias == nullptr) ? &dbias_ : dbias; - - dx->Resize(x->dims()); - dx->mutable_data(ctx.GetPlace()); - - dscale->Resize(phi::make_ddim(axes)); - - dbias->Resize(phi::make_ddim(axes)); - - // dscale should be of U type - phi::DenseTensor* tmp_dscale = dscale; - phi::DenseTensor cast_dscale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dscale.Resize(dscale->dims()); - cast_dscale.mutable_data(ctx.GetPlace()); - tmp_dscale = &cast_dscale; - dscale->mutable_data(ctx.GetPlace()); - } else { - dscale->mutable_data(ctx.GetPlace()); - } - - // same for dbias - phi::DenseTensor* tmp_dbias = dbias; - phi::DenseTensor cast_dbias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dbias.Resize(dbias->dims()); - cast_dbias.mutable_data(ctx.GetPlace()); - tmp_dbias = &cast_dbias; - dbias->mutable_data(ctx.GetPlace()); - } else { - dbias->mutable_data(ctx.GetPlace()); - } - - const auto& runner = - NpuOpRunner("LayerNormGrad", - {*dy, *x, cast_variance, cast_mean, cast_scale}, - {*dx, *tmp_dscale, *tmp_dbias}, - {}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dscale->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dscale->type())); - const auto& runner_cast_dscale = - NpuOpRunner("Cast", - {*tmp_dscale}, - {*dscale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dscale.Run(stream); - } - // same for dbias - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dbias->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dbias->type())); - const auto& runner_cast_dbias = - NpuOpRunner("Cast", - {*tmp_dbias}, - {*dbias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dbias.Run(stream); - } - - const_cast(mean)->Resize(mean_dims); - const_cast(variance)->Resize(mean_dims); - const_cast(scale)->Resize(phi::make_ddim({right})); - dscale->Resize(phi::make_ddim({right})); - dbias->Resize(phi::make_ddim({right})); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(layer_norm, - 
ops::LayerNormNPUKernel, - ops::LayerNormNPUKernel); -REGISTER_OP_NPU_KERNEL(layer_norm_grad, - ops::LayerNormGradNPUKernel, - ops::LayerNormGradNPUKernel); diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc deleted file mode 100644 index 4b9b96c23b0b7..0000000000000 --- a/paddle/fluid/operators/load_combine_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/load_combine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc deleted file mode 100644 index 0e8517fd7b529..0000000000000 --- a/paddle/fluid/operators/load_op_npu.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { -template -class LoadOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
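
// Overall flow of this kernel: open file_path as a binary stream,
// deserialize either a DenseTensor (honoring the optional seek/shape
// attributes) or a SelectedRows variable into the output, then cast the
// tensor to FP16 in place when load_as_fp16 is set.
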
- auto filename = ctx.Attr("file_path"); - std::ifstream fin(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), - true, - platform::errors::Unavailable( - "Load operator fail to open file %s, please check " - "whether the model file is complete or damaged.", - filename)); - - auto out_var_name = ctx.OutputNames("Out").data(); - auto *out_var = ctx.OutputVar("Out"); - - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::InvalidArgument( - "The variable %s to be loaded cannot be found.", out_var_name)); - - if (out_var->IsType()) { - LoadLodTensor(fin, place, out_var, ctx); - } else if (out_var->IsType()) { - LoadSelectedRows(fin, place, out_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Load operator only supports loading phi::DenseTensor and " - "SelectedRows " - "variable, %s has wrong type", - out_var_name)); - } - } - - void LoadLodTensor(std::istream &fin, - const platform::Place &place, - framework::Variable *var, - const framework::ExecutionContext &ctx) const { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - auto *tensor = var->GetMutable(); - - auto seek = ctx.Attr("seek"); - - if (seek != -1) { - PADDLE_ENFORCE_GE(seek, - 0, - platform::errors::InvalidArgument( - "seek witn tensor must great than or equal to 0")); - auto shape = ctx.Attr>("shape"); - paddle::framework::DeserializeFromStream( - fin, tensor, dev_ctx, seek, shape); - } else { - paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx); - } - - auto load_as_fp16 = ctx.Attr("load_as_fp16"); - auto in_dtype = tensor->dtype(); - auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = - phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, in_dtype); - auto out_kernel_type = - phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, out_dtype); - phi::DenseTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType( - in_kernel_type, out_kernel_type, *tensor, &fp16_tensor); - - // reset output tensor - var->Clear(); - tensor = var->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); - } - } - - void LoadSelectedRows(std::istream &fin, - const platform::Place &place, - framework::Variable *var) const { - auto *selectedRows = var->GetMutable(); - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc deleted file mode 100644 index 0eb4ebe2442c1..0000000000000 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LogLossAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scale, - phi::DenseTensor* y) { - // Calculate y = x + scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scale}}); - runner.Run(stream); -} - -template -void LogLossMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scale, - phi::DenseTensor* y) { - // Calculate y = x * scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scale}}); - runner.Run(stream); -} - -template -void LogLossBCE(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - z->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *y}, - {*z}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); -} - -template -void LogLossBCEGrad(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - const phi::DenseTensor* dout, - phi::DenseTensor* dx) { - dx->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *y, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); -} - -template -class LogLossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* y = ctx.Output("Loss"); - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - float factor = 1 / (1 + 2 * epsilon); - float coef = std::log(factor); - LogLossAdds(place, stream, pred, epsilon, y); - LogLossMuls(place, stream, y, factor, y); - LogLossBCE(place, stream, y, label, y); - LogLossAdds(place, stream, y, coef, y); - } -}; - -template -class LogLossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Predicted"); - auto* label = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = - ctx.Output(framework::GradVarName("Predicted")); - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - if (dpred) { - LogLossBCEGrad(place, stream, pred, label, dloss, dpred); - LogLossMuls(place, stream, dpred, 1 / (1 + 2 * epsilon), dpred); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(log_loss, ops::LogLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_loss_grad, ops::LogLossGradNPUKernel); diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc
b/paddle/fluid/operators/log_softmax_op_npu.cc deleted file mode 100644 index 34f9c11e066a7..0000000000000 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class LogSoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - const int rank = X->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - Out->mutable_data(ctx.GetPlace()); - - if (X->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner( - "LogSoftmaxV2", {*X}, {*Out}, {{"axes", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -template -class LogSoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - const int rank = dOut->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(ctx.GetPlace()); - - if (dOut->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("LogSoftmaxGrad", - {*dOut, *Out}, - {*dX}, - {{"axis", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(log_softmax, - ops::LogSoftmaxNPUKernel, - ops::LogSoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_softmax_grad, - ops::LogSoftmaxGradNPUKernel, - ops::LogSoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc deleted file mode 100644 index 8ae050541fb23..0000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
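
The LogSoftmaxV2/LogSoftmaxGrad calls removed above follow the standard definition along the canonicalized axis; a numerically stable one-axis sketch (hypothetical helper, not part of the deleted file):

#include <algorithm>
#include <cmath>
#include <vector>

// log_softmax(x)_i = x_i - log(sum_j exp(x_j)), computed with the usual
// max-subtraction trick to avoid overflow.
std::vector<float> LogSoftmaxRef(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  for (float v : x) sum += std::exp(v - m);
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] - m - std::log(sum);
  return out;
}
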
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -constexpr int64_t kNoPadding = -1; - -template -class LookupTableV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); // int tensor - auto *output_t = ctx.Output("Out"); // float tensor - auto *table_t = ctx.Input("W"); - - auto *table_var = ctx.InputVar("W"); - PADDLE_ENFORCE_EQ( - table_var->IsType(), - true, - platform::errors::InvalidArgument("NPU only accepts phi::DenseTensor")); - output_t->mutable_data(ctx.GetPlace()); - - int64_t padding_idx = ctx.Attr("padding_idx"); - if (padding_idx == kNoPadding) { - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } else { - phi::DenseTensor tmp_table_t(table_t->type()); - tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - - phi::DenseTensor index; - index.mutable_data({1, 1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&index, - static_cast(padding_idx)); - - auto updata_dim = phi::make_ddim({1, table_t->dims()[1]}); - phi::DenseTensor update; - update.mutable_data(updata_dim, ctx.GetPlace()); - FillNpuTensorWithConstant(&update, static_cast(0)); - update.Resize(updata_dim); - - NpuOpRunner update_runner; - update_runner.SetType("TensorScatterUpdate") - .AddInput(*table_t) - .AddInput(index) - .AddInput(update) - .AddOutput(tmp_table_t); - update_runner.Run(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(tmp_table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } - } -}; - -template -class LookupTableV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); - auto *output_grad_t = - ctx.Input(framework::GradVarName("Out")); - auto *table_grad_t = - ctx.Output(framework::GradVarName("W")); - table_grad_t->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - int64_t padding_idx = ctx.Attr("padding_idx"); - - /* EmbeddingDenseGrad has a bug on large shapes, temporarily disable it. - - int embedding_dim = table_grad_t->dims()[1]; - if (embedding_dim % 32 == 0) { - // NOTE(pangyoki): The embedding_dim of phi::DenseTensor used in - // EmbeddingDenseGrad must be an integer multiple of 32. - int num_weights = table_grad_t->dims()[0]; - const auto &runner = - NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, - {*table_grad_t}, {{"num_weights", num_weights}, - {"padding_idx", -1}, - {"scale_grad_by_freq", false}}); - runner.Run(stream); - return; - } - */ - - const auto &runner_zeros = - NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); - runner_zeros.Run(stream); - - if (padding_idx == kNoPadding) { - // NOTE(zhiqiu): It seems in cann 20.1, the first input and output - // can be different tensors, but in cann 20.2+, it does an inplace - // operation. Thus, the first input and output should be the same tensor.
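
// The no-padding branch below is a scatter-add over the zeroed table:
// in effect d_W[ids[i]] += d_out[i] for every looked-up row, which is
// exactly the dense embedding gradient.
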
- const auto &runner_scatter = - NpuOpRunner("ScatterAdd", - {*table_grad_t, *ids_t, *output_grad_t}, - {*table_grad_t}, - {{"use_locking", true}}); - runner_scatter.Run(stream); - } else { - phi::DenseTensor casted_ids_t; - if (framework::TransToProtoVarType(ids_t->dtype()) != - framework::proto::VarType::INT32) { - casted_ids_t.mutable_data(ids_t->dims(), ctx.GetPlace()); - const auto &cast_runner = NpuOpRunner( - "Cast", {*ids_t}, {casted_ids_t}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_ids_t.ShareDataWith(*ids_t); - } - auto table_grad_dims = table_grad_t->dims(); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*output_grad_t) - .AddInput(casted_ids_t) - .AddInput(std::vector{table_grad_dims[0]}) - .AddOutput(*table_grad_t); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2_grad, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc deleted file mode 100644 index 96fba4b968869..0000000000000 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MaskedSelectedNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto mask = ctx.Input("Mask"); - auto out = ctx.Output("Y"); - - auto input_dim = input->dims(); - auto mask_dim = mask->dims(); - PADDLE_ENFORCE_EQ( - input_dim, - mask_dim, - platform::errors::InvalidArgument( - "The dim size of input and mask in OP(masked_selected) " - "must be equal, but got input dim:(%ld), mask dim: " - "(%ld). 
Please check input " - "value.", - input_dim, - mask_dim)); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - out->Resize({out_size_vec[0]}); - out->mutable_data(ctx.GetPlace()); - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - // TopKV2 may be unstable - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - Tensor input_tmp; - input_tmp.ShareDataWith(*input); - input_tmp.Resize({input->numel()}); - const auto& gather_runner = NpuOpRunner( - "GatherV2D", {input_tmp, topkv2_out}, {*out}, {{"axis", 0}}); - gather_runner.Run(stream); - } - } -}; - -template -class MaskedSelectedGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto mask = ctx.Input("Mask"); - auto y_grad = ctx.Input(framework::GradVarName("Y")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - x_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - 
.AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - topkv2_out.Resize({out_size_vec[0], 1}); - x_grad->Resize({x_grad->numel()}); - NpuOpRunner scatter_runner; - scatter_runner.SetType("ScatterNd"); - scatter_runner.AddInput(topkv2_out); - scatter_runner.AddInput(*y_grad); - scatter_runner.AddInput( - std::vector({static_cast(x_grad->numel())})); - scatter_runner.AddOutput(*x_grad); - scatter_runner.Run(stream); - x_grad->Resize(mask->dims()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(masked_select, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel); -REGISTER_OP_NPU_KERNEL(masked_select_grad, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 61cc7dc9f4b64..c52393c3e05ad 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,9 +20,8 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(sequence_pooling DEPS math_function jit_kernel_helper) -if(WITH_ASCEND_CL) - math_library(beam_search DEPS math_function beam_search_npu) -elseif(WITH_XPU) + +if(WITH_XPU) math_library(beam_search DEPS math_function beam_search_xpu) else() math_library(beam_search DEPS math_function) diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc deleted file mode 100644 index 937cd46d52888..0000000000000 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ /dev/null @@ -1,588 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
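
For reference, the masked_select kernels removed above (Cast, ReduceSum, TopKV2, GatherV2D) are equivalent to compacting the elements whose mask is true while preserving index order; a minimal sketch (hypothetical helper, not part of the deleted file):

#include <vector>

std::vector<float> MaskedSelectRef(const std::vector<float>& x,
                                   const std::vector<bool>& mask) {
  std::vector<float> out;
  for (size_t i = 0; i < x.size(); ++i) {
    if (mask[i]) out.push_back(x[i]);  // keep only masked-in elements
  }
  return out;
}
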
*/ - -#include "paddle/fluid/operators/math/beam_search.h" -#include "paddle/phi/common/data_type.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class NPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class BeamSearchFunctor { - public: - void operator()(const platform::NPUDeviceContext& ctx, - const phi::DenseTensor* pre_ids, - const phi::DenseTensor* pre_scores, - const phi::DenseTensor* ids, - const phi::DenseTensor* scores, - phi::DenseTensor* selected_ids, - phi::DenseTensor* selected_scores, - phi::DenseTensor* parent_idx, - size_t level, - size_t beam_size, - int end_id, - bool is_accumulated) { - auto abs_lod = framework::ToAbsOffset(scores->lod()); - auto& high_level = abs_lod[level]; - - int64_t num_seqs = scores->NumElements(level); - // size of the first beam is 1, others are equal to beam_size - int64_t real_beam_size = static_cast(scores->dims()[0] / num_seqs); - // K - int64_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { - seq_width *= scores->dims()[i]; - } - - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - - int64_t total_length = num_seqs * beam_size; - int64_t batch_size = static_cast(scores->dims()[0]); - selected_ids->mutable_data(phi::make_ddim({total_length, 1}), - place); - selected_scores->mutable_data(phi::make_ddim({total_length, 1}), - place); - parent_idx->mutable_data(phi::make_ddim({total_length}), place); - - // Step1: Define Tensors and Preprocess the situation that pre_id == end_id - - // cast ids and pre_ids from int to float32 - Tensor ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(ids->dtype()) != - framework::proto::VarType::INT32) { - ids_int32.Resize(ids->dims()); - ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_ids_int32 = - ConvertToNpuDtype(framework::TransToProtoVarType(ids_int32.dtype())); - const auto& runner_ids_int32 = - NpuOpRunner("Cast", - {*ids}, - {ids_int32}, - {{"dst_type", static_cast(dst_dtype_ids_int32)}}); - runner_ids_int32.Run(stream); - } else { - ids_int32.ShareDataWith(*ids); - } - - Tensor pre_ids_int32(phi::DataType::INT32); - if (framework::TransToProtoVarType(pre_ids->dtype()) != - framework::proto::VarType::INT32) { - pre_ids_int32.Resize(pre_ids->dims()); - pre_ids_int32.mutable_data(ctx.GetPlace()); - auto dst_dtype_pre_ids_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(pre_ids_int32.dtype())); - const auto& runner_pre_ids_int32 = NpuOpRunner( - "Cast", - {*pre_ids}, - {pre_ids_int32}, - {{"dst_type", static_cast(dst_dtype_pre_ids_int32)}}); - runner_pre_ids_int32.Run(stream); - } else { - pre_ids_int32.ShareDataWith(*pre_ids); - } - - Tensor expand_pre_ids(pre_ids_int32.dtype()); - expand_pre_ids.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_ids.mutable_data(place); - const auto& runner_tile_pre_ids = - NpuOpRunner("TileWithAxis", - {pre_ids_int32}, - {expand_pre_ids}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_ids.Run(stream); - expand_pre_ids.Resize(ids_int32.dims()); - - Tensor expand_pre_scores(pre_scores->dtype()); - expand_pre_scores.Resize(phi::make_ddim({batch_size, seq_width})); - expand_pre_scores.mutable_data(place); - const auto& runner_tile_pre_scores = - NpuOpRunner("TileWithAxis", - {*pre_scores}, - {expand_pre_scores}, - {{"axis", 1}, {"tiles", seq_width}}); - runner_tile_pre_scores.Run(stream); - 
expand_pre_scores.Resize(scores->dims()); - - // End_id Tensors - Tensor end_id_tmp_tensor(phi::DataType::INT32); - end_id_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&end_id_tmp_tensor, end_id); - - Tensor end_id_tensors(ids_int32.dtype()); - end_id_tensors.mutable_data(ids_int32.dims(), place); - const auto& runner_fill_end_id = - NpuOpRunner("FillD", - {end_id_tmp_tensor}, - {end_id_tensors}, - {{"dims", phi::vectorize(ids_int32.dims())}}); - runner_fill_end_id.Run(stream); - - // whether expand_pre_ids == end_ids? - Tensor equal_end_ids(phi::DataType::BOOL); - equal_end_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_equal_end_ids = NpuOpRunner( - "Equal", {expand_pre_ids, end_id_tensors}, {equal_end_ids}, {}); - runner_equal_end_ids.Run(stream); - - // construct a Tensor with dimension ids->dims(): - // [[False, True, True, True, ...], - // [False, True, True, True, ...], - // ...] - Tensor false_tmp_tensor(phi::DataType::INT32); - false_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&false_tmp_tensor, static_cast(false)); - - Tensor first_pos_false_tensors(phi::DataType::INT32); - first_pos_false_tensors.Resize(phi::make_ddim({batch_size, 1})); - first_pos_false_tensors.mutable_data(place); - std::vector fill_dims = {batch_size, 1}; - framework::NPUAttributeMap fill_attr = {{"dims", fill_dims}}; - const auto& runner_fill_false_tensors = NpuOpRunner( - "FillD", {false_tmp_tensor}, {first_pos_false_tensors}, fill_attr); - runner_fill_false_tensors.Run(stream); - - Tensor pos_tensors(phi::DataType::INT32); - if (seq_width > 1) { - pos_tensors.Resize(phi::make_ddim({batch_size, seq_width})); - pos_tensors.mutable_data(place); - - Tensor true_tmp_tensor(phi::DataType::INT32); - true_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&true_tmp_tensor, static_cast(true)); - - Tensor second_pos_true_tensors(phi::DataType::INT32); - second_pos_true_tensors.Resize( - phi::make_ddim({batch_size, seq_width - 1})); - second_pos_true_tensors.mutable_data(place); - std::vector fill_dims2 = {batch_size, seq_width - 1}; - framework::NPUAttributeMap fill_attr2 = {{"dims", fill_dims2}}; - const auto& runner_fill_true_tensors = NpuOpRunner( - "FillD", {true_tmp_tensor}, {second_pos_true_tensors}, fill_attr2); - runner_fill_true_tensors.Run(stream); - - std::vector concat_inputs = {first_pos_false_tensors, - second_pos_true_tensors}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_false_true{"ConcatD", - {concat_inputs}, - {pos_tensors}, - {{"concat_dim", 1}, {"N", 2}}}; - runner_concat_false_true.AddInputNames(concat_names); - runner_concat_false_true.Run(stream); - pos_tensors.Resize(ids_int32.dims()); - } else { - pos_tensors.ShareDataWith(first_pos_false_tensors); - } - - Tensor cast_pos_tensors_bool(phi::DataType::BOOL); - cast_pos_tensors_bool.Resize(pos_tensors.dims()); - cast_pos_tensors_bool.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_pos_tensors_bool.type())); - const auto& runner_cast_pos_tensors = - NpuOpRunner("Cast", - {pos_tensors}, - {cast_pos_tensors_bool}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_pos_tensors.Run(stream); - - // if pre_ids == end_ids, save only one score, and others become -inf - // construct pre_ids == end_ids and save only one score - Tensor save_one_end_score(phi::DataType::BOOL); - save_one_end_score.mutable_data(ids_int32.dims(), place); - const auto& runner_logical_and = 
- NpuOpRunner("LogicalAnd", - {equal_end_ids, cast_pos_tensors_bool}, - {save_one_end_score}, - {}); - runner_logical_and.Run(stream); - - // if save_one_end_score is True, set score to -inf - // define -Inf Tensors - Tensor ninf_tmp_tensor(scores->dtype()); - ninf_tmp_tensor.mutable_data({1}, ctx.GetPlace()); - float ninf_value = - static_cast(-std::numeric_limits::infinity()); - FillNpuTensorWithConstant(&ninf_tmp_tensor, ninf_value); - - Tensor ninf_tensors(scores->dtype()); - ninf_tensors.mutable_data(scores->dims(), place); - const auto& runner_fill_ninf = - NpuOpRunner("FillD", - {ninf_tmp_tensor}, - {ninf_tensors}, - {{"dims", phi::vectorize(scores->dims())}}); - runner_fill_ninf.Run(stream); - - // Step2: calculate topk scores - - // get scores used in topk op - Tensor tmp_scores(scores->dtype()); - tmp_scores.mutable_data(scores->dims(), place); - if (!is_accumulated) { - // if pre_id == end_id, cal_scores = pre_score, and id = end_id - // else, cal_score = pre_score + log(score) - - // calculate log(scores) - Tensor log_scores(scores->dtype()); - log_scores.mutable_data(scores->dims(), place); - - Tensor one(scores->dtype()); - one.mutable_data(scores->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*scores}, {one}, {}); - runner_one.Run(stream); - - Tensor sub(scores->dtype()); - sub.mutable_data(scores->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*scores, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_log_scores = - NpuOpRunner("Log1p", {sub}, {log_scores}, {}); - runner_log_scores.Run(stream); - - // tmp_scores = pre_score + log(scores) - const auto& runner_add_scores = - NpuOpRunner("Add", {log_scores, *pre_scores}, {tmp_scores}, {}); - runner_add_scores.Run(stream); - - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, tmp_scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score.Run(stream); - } else { - // if pre_ids == end_ids, use pre_score rather than score - const auto& runner_select_equal_end_score2 = - NpuOpRunner("Select", - {equal_end_ids, expand_pre_scores, *scores}, - {tmp_scores}, - {}); - runner_select_equal_end_score2.Run(stream); - } - - // if pre_ids == end_ids, save only one score, and others become -inf - Tensor cal_scores(scores->dtype()); - cal_scores.mutable_data(scores->dims(), place); - const auto& runner_select_inf_score = - NpuOpRunner("Select", - {save_one_end_score, ninf_tensors, tmp_scores}, - {cal_scores}, - {}); - runner_select_inf_score.Run(stream); - - // resize scores from [num_seqs * beam_size, K] to [num_seqs, beam_size * K] - // real_beam_size = 1 or beam_size - cal_scores.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - Tensor topk_scores(scores->dtype()); - topk_scores.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - topk_scores.mutable_data(ctx.GetPlace()); - - Tensor tmp_indices(phi::DataType::INT32); - tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - tmp_indices.mutable_data(ctx.GetPlace()); - - // run topk op - NpuOpRunner runner_topk; - runner_topk.SetType("TopKV2") - .AddInput(cal_scores) - .AddInput(std::vector{static_cast(beam_size)}) - .AddOutput(topk_scores) - .AddOutput(tmp_indices) - .AddAttr("sorted", true) - .AddAttr("dim", -1) - .AddAttr("largest", true); - runner_topk.Run(stream); - - // cast tmp_indices from int to float32 for Sort op - Tensor 
cast_tmp_indices(phi::DataType::FLOAT32); - cast_tmp_indices.Resize(tmp_indices.dims()); - cast_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_fp32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_tmp_indices.type())); - const auto& runner_cast_tmp_indices = NpuOpRunner( - "Cast", - {tmp_indices}, - {cast_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_fp32)}}); - runner_cast_tmp_indices.Run(stream); - - // sort tmp_indices - Tensor sorted_tmp_indices(phi::DataType::FLOAT32); - sorted_tmp_indices.Resize(tmp_indices.dims()); - sorted_tmp_indices.mutable_data(ctx.GetPlace()); - Tensor sorted_score_indices(phi::DataType::INT32); - sorted_score_indices.Resize(tmp_indices.dims()); - sorted_score_indices.mutable_data(ctx.GetPlace()); - const auto& runner_sort_tmp_indices = - NpuOpRunner("Sort", - {cast_tmp_indices}, - {sorted_tmp_indices, sorted_score_indices}, - {{"axis", 1}, {"descending", false}}); - runner_sort_tmp_indices.Run(stream); - - // cast sorted_tmp_indices from float32 to int - Tensor cast_sort_tmp_indices(phi::DataType::INT32); - cast_sort_tmp_indices.Resize(sorted_tmp_indices.dims()); - cast_sort_tmp_indices.mutable_data(ctx.GetPlace()); - auto dst_dtype_tmp_indices_int32 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_sort_tmp_indices.type())); - const auto& runner_cast_sort_tmp_indices = NpuOpRunner( - "Cast", - {sorted_tmp_indices}, - {cast_sort_tmp_indices}, - {{"dst_type", static_cast(dst_dtype_tmp_indices_int32)}}); - runner_cast_sort_tmp_indices.Run(stream); - - // Step 3: infer selected ids from tmp_indices and ids - - // if pre_ids == end_ids, use pre_ids rather than ids - Tensor cal_ids(ids_int32.dtype()); - cal_ids.mutable_data(ids_int32.dims(), place); - const auto& runner_select_equal_end_id = NpuOpRunner( - "Select", {equal_end_ids, expand_pre_ids, ids_int32}, {cal_ids}, {}); - runner_select_equal_end_id.Run(stream); - - // resize ids from [num_seqs * real_beam_size, K] to [num_seqs, - // real_beam_size * K] - // real_beam_size = 1 or beam_size - cal_ids.Resize(phi::make_ddim({num_seqs, real_beam_size * seq_width})); - - // construct batch_ids like [[0, 0, 0], [1, 1, 1], ..., [bs-1, bs-1, bs-1]] - // construct arange(num_seqs*beam_size).reshape((num_seqs, beam_size)) // - // beam_size - Tensor batch_ids(phi::DataType::INT32); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - batch_ids.mutable_data(place); - - std::vector vector_batch_ids; - for (int i = 0; i < num_seqs * static_cast(beam_size); ++i) { - vector_batch_ids.push_back(static_cast(i / beam_size)); - } - framework::TensorFromVector(vector_batch_ids, ctx, &batch_ids); - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - - // sort topk_scores to get selected_scores - // get indices of gather_nd op for calculating selected_scores - Tensor gather_nd_score_indices(phi::DataType::INT32); - gather_nd_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_score_indices.mutable_data(place); - - sorted_score_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - std::vector concat_inputs2 = {batch_ids, - sorted_score_indices}; - std::vector concat_names = {"x0", "x1"}; - NpuOpRunner runner_concat_score_indices{"ConcatD", - {concat_inputs2}, - {gather_nd_score_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_score_indices.AddInputNames(concat_names); - runner_concat_score_indices.Run(stream); - - // use gather_nd to get 
selected_scores - const auto& runner_gather_nd_scores = - NpuOpRunner("GatherNd", - {topk_scores, gather_nd_score_indices}, - {*selected_scores}, - {}); - runner_gather_nd_scores.Run(stream); - - // get indices of gather_nd op - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 1})); - Tensor gather_nd_id_indices(phi::DataType::INT32); - gather_nd_id_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size), 2})); - gather_nd_id_indices.mutable_data(place); - - std::vector concat_inputs3 = {batch_ids, - cast_sort_tmp_indices}; - NpuOpRunner runner_concat_id_indices{"ConcatD", - {concat_inputs3}, - {gather_nd_id_indices}, - {{"concat_dim", 2}, {"N", 2}}}; - runner_concat_id_indices.AddInputNames(concat_names); - runner_concat_id_indices.Run(stream); - - // use gather_nd to get selected_ids - Tensor topk_ids(phi::DataType::INT32); - topk_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - topk_ids.mutable_data(ctx.GetPlace()); - - const auto& runner_gather_nd_ids = NpuOpRunner( - "GatherNd", {cal_ids, gather_nd_id_indices}, {topk_ids}, {}); - runner_gather_nd_ids.Run(stream); - - // cast topk_ids from int to int64 to get selected_ids - auto dst_dtype_selected_ids = - ConvertToNpuDtype(framework::TransToProtoVarType(selected_ids->type())); - const auto& runner_cast_selected_ids = - NpuOpRunner("Cast", - {topk_ids}, - {*selected_ids}, - {{"dst_type", static_cast(dst_dtype_selected_ids)}}); - runner_cast_selected_ids.Run(stream); - - // TODO(pangyoki): PruneEndBeams - - // Step 4: set lod of output Tensor - // define Tensor with value `seq_width` - Tensor seq_width_tensor(phi::DataType::INT32); - seq_width_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&seq_width_tensor, - static_cast(seq_width)); - - // beam_ids = tmp_indices // seq_width - Tensor beam_ids(phi::DataType::INT32); - beam_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - beam_ids.mutable_data(ctx.GetPlace()); - cast_sort_tmp_indices.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - const auto& runner_div = NpuOpRunner( - "Div", {cast_sort_tmp_indices, seq_width_tensor}, {beam_ids}, {}); - runner_div.Run(stream); - - // get parent_idx by adding batch_ids to beam_ids - // construct scale_batch_ids like [[0, 0, 0], [bw, bw, bw], ..., [bs-1*bw, - // bs-1*bw, bs-1*bw]] - batch_ids.Resize( - phi::make_ddim({num_seqs, static_cast(beam_size)})); - - // cast batch_ids from int to float32 - Tensor cast_batch_ids(phi::DataType::FLOAT32); - cast_batch_ids.Resize(batch_ids.dims()); - cast_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype1 = ConvertToNpuDtype( - framework::TransToProtoVarType(cast_batch_ids.type())); - const auto& runner_cast_batch_ids = - NpuOpRunner("Cast", - {batch_ids}, - {cast_batch_ids}, - {{"dst_type", static_cast(dst_dtype1)}}); - runner_cast_batch_ids.Run(stream); - - // scale batch_ids with beam_size - Tensor scale_batch_ids(phi::DataType::FLOAT32); - scale_batch_ids.Resize(batch_ids.dims()); - scale_batch_ids.mutable_data(place); - const auto& runner_power = - NpuOpRunner("Power", - {cast_batch_ids}, - {scale_batch_ids}, - {{"power", static_cast(1.0)}, - {"scale", static_cast(beam_size)}, - {"shift", static_cast(0.0)}}); - runner_power.Run(stream); - - // cast cast_scale_batch_ids from float32 to int - Tensor cast_scale_batch_ids(phi::DataType::INT32); - cast_scale_batch_ids.Resize(scale_batch_ids.dims()); - cast_scale_batch_ids.mutable_data(ctx.GetPlace()); - auto dst_dtype2 = 
ConvertToNpuDtype( - framework::TransToProtoVarType(cast_scale_batch_ids.type())); - const auto& runner_cast_scale_batch_ids = - NpuOpRunner("Cast", - {scale_batch_ids}, - {cast_scale_batch_ids}, - {{"dst_type", static_cast(dst_dtype2)}}); - runner_cast_scale_batch_ids.Run(stream); - - // calculate parent_idx - Tensor tmp_parent_idx(phi::DataType::INT32); - tmp_parent_idx.Resize(parent_idx->dims()); - tmp_parent_idx.mutable_data(place); - const auto& runner_add_beam_id = NpuOpRunner( - "Add", {beam_ids, cast_scale_batch_ids}, {tmp_parent_idx}, {}); - runner_add_beam_id.Run(stream); - - // cast tmp_parent_idx from int to int64 to get parent_idx - auto dst_dtype_parent_idx = - ConvertToNpuDtype(framework::TransToProtoVarType(parent_idx->type())); - const auto& runner_cast_parent_idx = - NpuOpRunner("Cast", - {tmp_parent_idx}, - {*parent_idx}, - {{"dst_type", static_cast(dst_dtype_parent_idx)}}); - runner_cast_parent_idx.Run(stream); - - std::vector vector_parent_idx; - framework::TensorToVector(tmp_parent_idx, ctx, &vector_parent_idx); - - // set low level, len(low_level) = high_level[-1] - std::vector low_level; - std::vector num_parent_ids(num_seqs * beam_size, - static_cast(0)); - size_t low_level_size = high_level[num_seqs]; - size_t sum_parent_id = 0; - - // calculate number of every parent_id - for (size_t i = 0; i < num_seqs * beam_size; ++i) { - num_parent_ids[vector_parent_idx[i]]++; - } - - // update low_level - low_level.push_back(0); - for (size_t i = 0; i < low_level_size; ++i) { - sum_parent_id += num_parent_ids[i]; - low_level.push_back(sum_parent_id); - } - - // fill lod - framework::LoD lod(2); - lod[0].assign(high_level.begin(), high_level.end()); - lod[1].assign(low_level.begin(), low_level.end()); - if (!framework::CheckLoD(lod)) { - PADDLE_THROW(platform::errors::InvalidArgument( - "lod %s is not right in" - " beam_search, please check your code.", - framework::LoDToString(lod))); - } - selected_ids->set_lod(lod); - selected_scores->set_lod(lod); - } -}; - -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc deleted file mode 100644 index d49d9a319ccff..0000000000000 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ /dev/null @@ -1,561 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void Mul(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); - runner_dx.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); - runner_dx.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void Dot(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("MatMul", - {X, Y}, - {Out_temp}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {Out_temp}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const 
std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - float alpha = static_cast(ctx.Attr("alpha")); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), - Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), - Y->numel())); - Out->Resize({1}); - Dot(ctx, stream, *X, *Y, Out, alpha); - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - y_ndim = 2; - out_ndim += 1; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (transpose_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." 
- "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (x_ndim == 2 && y_ndim == 2) { - MatMul2D( - ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); - return; - } - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector vec_dim = {x_temp.numel() / K, K}; - x_temp.Resize(phi::make_ddim(vec_dim)); - MatMul2D( - ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - MatMulND(ctx, - stream, - x_temp_brd, - y_temp_brd, - Out, - transpose_x, - transpose_y, - alpha); - } -}; - -template -class MatMulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - float alpha = static_cast(ctx.Attr("alpha")); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->dtype()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - Mul(ctx, stream, dout_temp, *Y, dX, alpha); - } - if (dY) { - Mul(ctx, stream, dout_temp, *X, dY, alpha); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); 
- if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (transpose_x) { - MatMul2D( - ctx, stream, y_temp, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (transpose_y) { - MatMul2D( - ctx, stream, dout_temp, x_temp, dY, true, transpose_x, alpha); - } else { - MatMul2D( - ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, alpha); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = transpose_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (transpose_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (transpose_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - dX, - false, - !transpose_y, - alpha); - } - } else { - phi::DenseTensor dx_temp(X->dtype()); - 
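// Illustrative aside (assumed shapes, not from the original file): dx_temp
// holds the gradient at the broadcast shape, and ReduceDims then sums it
// back down over every expanded axis. For example, with x_dims = {3, 4}
// and x_broadcast_dims = {2, 3, 4}:
//   diff = 3 - 2 = 1       -> axis 0 is a new leading axis, so reduce it;
//   brd[1] == dims[0] == 3 -> keep axis 1;
//   brd[2] == dims[1] == 4 -> keep axis 2;
// giving axes = {0}, so ReduceSumD collapses {2, 3, 4} back to {3, 4}.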
dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (transpose_x) { - MatMulND(ctx, - stream, - y_temp_brd, - dout_temp, - &dx_temp, - transpose_y, - true, - alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - &dx_temp, - false, - !transpose_y, - alpha); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (transpose_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - dY, - !transpose_x, - false, - alpha); - } - } else { - phi::DenseTensor dy_temp(Y->dtype()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (transpose_y) { - MatMulND(ctx, - stream, - dout_temp, - x_temp_brd, - &dy_temp, - true, - transpose_x, - alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - &dy_temp, - !transpose_x, - false, - alpha); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - matmul, - ops::MatMulNPUKernel, - ops::MatMulNPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_grad, - ops::MatMulGradNPUKernel, - ops::MatMulGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc deleted file mode 100644 index 2a398fbb5499b..0000000000000 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/matmul_v2_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); -} - -#if (CANN_VERSION_CODE < 504000) -template <> -void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - phi::DenseTensor x_fp32, y_fp32, out_fp32; - x_fp32.Resize(X.dims()); - y_fp32.Resize(Y.dims()); - out_fp32.Resize(Out->dims()); - x_fp32.mutable_data(ctx.GetPlace()); - y_fp32.mutable_data(ctx.GetPlace()); - out_fp32.mutable_data(ctx.GetPlace()); - - const auto& cast_x = - NpuOpRunner("Cast", - {X}, - {x_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(x_fp32.type())))}}); - cast_x.Run(stream); - const auto& cast_y = - NpuOpRunner("Cast", - {Y}, - {y_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(y_fp32.type())))}}); - cast_y.Run(stream); - - const auto& runner = NpuOpRunner("BatchMatMul", - {x_fp32, y_fp32}, - {out_fp32}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - - const auto& cast_out = NpuOpRunner( - "Cast", - {out_fp32}, - {*Out}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::TransToProtoVarType(Out->type())))}}); - cast_out.Run(stream); -} -#endif - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = 
ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), - Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), - Y->numel())); - Out->Resize({1}); - Out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); - runner.Run(stream); - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - y_ndim = 2; - out_ndim += 1; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (x_ndim == 2 && y_ndim == 2) { - MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); - return; - } - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector vec_dim = {x_temp.numel() / K, K}; - x_temp.Resize(phi::make_ddim(vec_dim)); - MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); - } -}; - -template -class MatMulV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->type()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); - runner_dx.Run(stream); - } - if (dY) { - dY->mutable_data(ctx.GetPlace()); - const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); - runner_dy.Run(stream); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (trans_x) { - MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); - } else { - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (trans_x) { - MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); - } else { - MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); - } - } else { - phi::DenseTensor dx_temp(X->type()); - dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (trans_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, true); - } else { - MatMulND( - ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, !trans_y); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (trans_y) { - MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); - } else { - MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); - } - } else { - phi::DenseTensor dy_temp(Y->type()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (trans_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, trans_x); - } else { - MatMulND( - ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, false); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL(matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/mean_op_npu.cc 
b/paddle/fluid/operators/mean_op_npu.cc deleted file mode 100644 index 3df6a6a04d541..0000000000000 --- a/paddle/fluid/operators/mean_op_npu.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class MeanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector axes; - - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class MeanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto stream = - context.template device_context() - .stream(); - - auto grad = context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input phi::DenseTensor len should be 1. But " - "received Out@Grad's elements num is %d.", - grad->numel())); - - auto IG = context.Output(framework::GradVarName("X")); - IG->mutable_data(context.GetPlace()); - - // ones - phi::DenseTensor ones(grad->dtype()); - ones.mutable_data(IG->dims(), context.GetPlace()); - const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); - runner_ones.Run(stream); - - // means - phi::DenseTensor mean_tensor(grad->dtype()); - mean_tensor.Resize({1}); - mean_tensor.mutable_data(context.GetPlace()); - FillNpuTensorWithConstant( - &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); - - // means mul ones - phi::DenseTensor mean_ma(grad->dtype()); - mean_ma.Resize(IG->dims()); - mean_ma.mutable_data(context.GetPlace()); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); - runner_mul_1.Run(stream); - - // and mul grad - const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); - runner_mul_2.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - mean, - ops::MeanNPUKernel, - ops::MeanNPUKernel) - -REGISTER_OP_NPU_KERNEL( - mean_grad, - ops::MeanGradNPUKernel, - ops::MeanGradNPUKernel) diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc deleted file mode 100644 index e60af8bd480ea..0000000000000 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MeshgridNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto ins = context.MultiInput("X"); - auto outs = context.MultiOutput("Out"); - PADDLE_ENFORCE_EQ( - (ins.size() > 1) && (ins.size() < 7), - true, - platform::errors::InvalidArgument( - "Expected Tensor numbers between 2 and 6, but only received %d.", - ins.size())); - - int64_t size = ins.size(); - std::vector shape(size); - - for (int64_t i = 0; i < size; i++) { - switch (ins[i]->dims().size()) { - case 0: - shape[i] = 1; - break; - case 1: - shape[i] = ins[i]->dims()[0]; - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected scalar or 1D tensor in the tensor list but got tensor " - "%d: ", - i)); - } - } - - for (int64_t i = 0; i < size; i++) { - std::vector view_shape(size, 1); - view_shape[i] = shape[i]; - - framework::DDim out_dims_reshape = phi::make_ddim(view_shape); - phi::DenseTensor reshape_ins_tensor(ins[i]->dtype()); - reshape_ins_tensor.ShareDataWith(*ins[i]); - reshape_ins_tensor.Resize(out_dims_reshape); - - framework::DDim out_dims = phi::make_ddim(shape); - outs[i]->Resize(out_dims); - outs[i]->mutable_data(context.GetPlace()); - - auto stream = - context.template device_context() - .stream(); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(reshape_ins_tensor) - .AddInput(std::move(shape)) - .AddOutput(*(outs[i])) - .Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - meshgrid, - paddle::operators::MeshgridNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::MeshgridNPUKernel, -#endif - paddle::operators::MeshgridNPUKernel, - paddle::operators::MeshgridNPUKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc deleted file mode 100644 index 094f39366ab35..0000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class AccuracyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* label = ctx.Input("Label"); - auto* indices = ctx.Input("Indices"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - auto stream = - ctx.template device_context() - .stream(); - - int num_samples = inference->dims()[0]; - if (num_samples == 0) { - return; - } - - // cast `indices` or `label` if their type is not consistent - Tensor cast_indices(phi::DataType::INT32); - Tensor cast_label(phi::DataType::INT32); - if (indices->dtype() != label->dtype()) { - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - if (framework::TransToProtoVarType(indices->dtype()) != - framework::proto::VarType::INT32) { - cast_indices.Resize(indices->dims()); - cast_indices.mutable_data(ctx.GetPlace()); - const auto& runner_cast_indices = - NpuOpRunner("Cast", - {*indices}, - {cast_indices}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_indices.Run(stream); - } else { - cast_indices.ShareDataWith(*indices); - } - if (framework::TransToProtoVarType(label->dtype()) != - framework::proto::VarType::INT32) { - cast_label.Resize(label->dims()); - cast_label.mutable_data(ctx.GetPlace()); - const auto& runner_cast_label = - NpuOpRunner("Cast", - {*label}, - {cast_label}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_label.Run(stream); - } else { - cast_label.ShareDataWith(*label); - } - } else { - cast_indices.ShareDataWith(*indices); - cast_label.ShareDataWith(*label); - } - - // equal - Tensor tmp_equal(phi::DataType::BOOL); - tmp_equal.Resize(inference->dims()); - tmp_equal.mutable_data(ctx.GetPlace()); - const auto& runner_equal = - NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); - runner_equal.Run(stream); - - // cast equal - Tensor tmp_equal_cast(phi::DataType::FLOAT32); - tmp_equal_cast.Resize(inference->dims()); - tmp_equal_cast.mutable_data(ctx.GetPlace()); - const auto& runner_cast_equal = NpuOpRunner( - "Cast", - {tmp_equal}, - {tmp_equal_cast}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(tmp_equal_cast.dtype())))}}); - runner_cast_equal.Run(stream); - - // [correct] - // reduce_max - Tensor tmp_correct_max(phi::DataType::FLOAT32); - tmp_correct_max.Resize(phi::make_ddim({num_samples})); - tmp_correct_max.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_max = - NpuOpRunner("ReduceMaxD", - {tmp_equal_cast}, - {tmp_correct_max}, - {{"axes", std::vector{1}}, {"keep_dims", false}}); - runner_reduce_max.Run(stream); - - // reduce_sum - Tensor tmp_correct(phi::DataType::FLOAT32); - tmp_correct.Resize(correct->dims()); - tmp_correct.mutable_data(ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {tmp_correct_max}, - {tmp_correct}, - {{"axes", std::vector{0}}, {"keep_dims", false}}); - runner_reduce_sum.Run(stream); - - // cast to int - correct->mutable_data(ctx.GetPlace()); - const auto& runner_cast_correct = - NpuOpRunner("Cast", - {tmp_correct}, - {*correct}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(correct->dtype())))}}); - runner_cast_correct.Run(stream); - - // [total] - total->mutable_data(ctx.GetPlace()); 
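// Illustrative aside (assumed values, not from the original file): the
// pipeline above marks a sample correct when any of its top-k indices
// equals the label, so with num_samples = 4 and per-sample hits
// {1, 0, 1, 1}:
//   correct  = ReduceSum(ReduceMax(equal_cast, axis=1)) = 3
//   total    = num_samples                              = 4
//   accuracy = correct / total                          = 0.75
// which is exactly what the Div runner at the end computes.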
- FillNpuTensorWithConstant(total, static_cast(num_samples)); - - // use `total` of type `float32` for calculating accuracy - Tensor tmp_total(phi::DataType::FLOAT32); - tmp_total.Resize(total->dims()); - tmp_total.mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(&tmp_total, - static_cast(num_samples)); - - // [accuracy] - accuracy->mutable_data(ctx.GetPlace()); - const auto& runner_accuracy = - NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); - runner_accuracy.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - accuracy, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel, - ops::AccuracyNPUKernel); diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc deleted file mode 100644 index d8b713de96fff..0000000000000 --- a/paddle/fluid/operators/mul_op_npu.cc +++ /dev/null @@ -1,274 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {*x, *y}, - {*out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner.Run(stream); - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // reshape - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - out->mutable_data(ctx.GetPlace()); - // matmul - const auto& runner = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {*out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner.Run(stream); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("npu error: not support dims")); - } - // to do other - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only support x_num_col_dims == 2: but got %d", - x_num_col_dims)); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(y->dtype()) == - framework::proto::VarType::FP16) { - // NOTE: When the dim of the input and output shapes is 
inconsistent, - // (Boradcast) BatchMatMul NPU OP only support FP16. - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*x, *y}, - {*out}, - {{"adj_x1", false}, {"adj_x2", false}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - // flatten => x.shape=[6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - - // matmul [6,4] , [4, 5] => [6, 5] - out->mutable_data(ctx.GetPlace()); - - Tensor tmp_out(x->type()); - tmp_out.ShareDataWith(*out); - tmp_out.Resize(phi::make_ddim({first_dim, y->dims()[1]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {tmp_out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner_matmul.Run(stream); - } - } - } -}; - -template -class MulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", - {*dout, *y}, - {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - - runner_dx.Run(stream); - } - - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {*x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // flatten => x.shape=[6, 4] - // matmul - if (dx) { - // matmul [2, 5] * [12, 5] => [2, 12] - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout->dims()[0], y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {*dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - - if (dy) { - // flatten - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only support x_num_col_dims == 2: but got %d", - x_num_col_dims)); - // tmp_dout both used by dx and dy - Tensor tmp_dout(x->type()); - int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; - int64_t dout_sec_dim = dout->dims()[2]; - tmp_dout.ShareDataWith(*dout); - tmp_dout.Resize(phi::make_ddim({dout_first_dim, dout_sec_dim})); - - if (dx) { - // tmp_dout * y [2, 3, 5] * 
[4,5] => [2, 3, 4] - if (framework::TransToProtoVarType(dout->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(y->dtype()) == - framework::proto::VarType::FP16) { - // NOTE: When the dim of the input and output shapes is inconsistent, - // (Boradcast) BatchMatMul NPU OP only support FP16. - dx->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*dout, *y}, - {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout_first_dim, y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - } - if (dy) { - // flatten x.shape [2,3,4] => [6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - // mamtul [6,4] [6,5] =>[4,5] - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, tmp_dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - mul, - ops::MulNPUKernel, - ops::MulNPUKernel); -REGISTER_OP_NPU_KERNEL( - mul_grad, - ops::MulGradNPUKernel, - ops::MulGradNPUKernel); diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc deleted file mode 100644 index 425b7c6738633..0000000000000 --- a/paddle/fluid/operators/multinomial_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in -// cmake/operators.cmake when Paddle supports -#if (CANN_VERSION_CODE >= 504000) - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class NPUMultinomialKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - out->mutable_data(place); - - const auto& runner = NpuOpRunner( - "MultinomialWithReplacementD", - {*x}, - {*out}, - {{"num_samples", num_samples}, {"replacement", replacement}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - multinomial, - ops::NPUMultinomialKernel, - ops::NPUMultinomialKernel) -#endif diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc deleted file mode 100644 index b839b3e8ec2e0..0000000000000 --- a/paddle/fluid/operators/norm_op_npu.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -void CheckAxis(int axis, int rank) { - // check the axis is in [-rank, rank-1] - if (axis <= rank - 1 && axis >= -rank) return; - PADDLE_THROW(platform::errors::InvalidArgument( - "axis in norm operator must between (%d) and (%d)" - "but got (%d).", - -rank, - rank - 1, - axis)); -} - -template -class NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - VLOG(4) << "Launch Norm Op Kernel on NPU." 
<< std::endl; - auto *in_x = ctx.Input("X"); - auto *out_y = ctx.Output("Out"); - auto *out_norm = ctx.Output("Norm"); - out_y->mutable_data(ctx.GetPlace()); - out_norm->mutable_data(ctx.GetPlace()); - auto xdim = in_x->dims(); - float eps = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - CheckAxis(axis, xdim.size()); - if (axis < 0) axis = xdim.size() + axis; - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["axes"] = std::vector({axis}); - attr_input_norm["p"] = 2; - attr_input_norm["keepdim"] = true; - attr_input_norm["epsilon"] = eps; - const auto &runner = - NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - NpuOpRunner("Div", {*in_x, *out_norm}, {*out_y}, {}).Run(stream); - } -}; - -template -class NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Out"); - auto *dy = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - - auto xdim = x->dims(); - CheckAxis(axis, xdim.size()); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["dim"] = std::vector({axis}); - attr_input_norm["eps"] = epsilon; - const auto &runner = - NpuOpRunner("L2NormalizeGrad", {*x, *y, *dy}, {*dx}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - norm, - ops::NormNPUKernel, - ops::NormNPUKernel) - -REGISTER_OP_NPU_KERNEL( - norm_grad, - ops::NormGradNPUKernel, - ops::NormGradNPUKernel); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc deleted file mode 100644 index e44f6286afa9b..0000000000000 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/one_hot_op.h" - -namespace paddle { -namespace operators { - -template -class OneHotNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot, - ops::OneHotNPUKernel, - ops::OneHotNPUKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc deleted file mode 100644 index b213d3345d1f0..0000000000000 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class OneHotV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot_v2, - ops::OneHotV2NPUKernel, - ops::OneHotV2NPUKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc deleted file mode 100644 index 3324e56b3b95f..0000000000000 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ /dev/null @@ -1,345 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class AdamNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); - auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Grad(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(param_var->Type()))); - auto* grad = ctx.Input("Grad"); - auto* mom1 = ctx.Input("Moment1"); - auto* mom2 = ctx.Input("Moment2"); - auto* lr = ctx.Input("LearningRate"); - - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); - - auto* param_out = ctx.Output("ParamOut"); - auto* mom1_out = ctx.Output("Moment1Out"); - auto* mom2_out = ctx.Output("Moment2Out"); - auto* beta1_pow_out = ctx.Output("Beta1PowOut"); - auto* beta2_pow_out = ctx.Output("Beta2PowOut"); - - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector( - *skip_update_tensor, ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update) { - VLOG(4) << "Adam skip update"; - framework::TensorCopy( - *param, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - framework::TensorCopy( - *mom1, - ctx.GetPlace(), - ctx.template device_context(), - mom1_out); - framework::TensorCopy( - *mom2, - ctx.GetPlace(), - ctx.template device_context(), - mom2_out); - framework::TensorCopy( - *beta1_pow, - beta1_pow->place(), - ctx.template device_context(), - beta1_pow_out); - framework::TensorCopy( - *beta2_pow, - beta2_pow->place(), - ctx.template device_context(), - beta2_pow_out); - return; - } - - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - param_out->mutable_data(ctx.GetPlace()); - mom1_out->mutable_data(ctx.GetPlace()); - mom2_out->mutable_data(ctx.GetPlace()); - - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform - // place. 
- phi::DenseTensor beta1_pow_tmp; - phi::DenseTensor beta2_pow_tmp; - if (beta1_pow->place() == platform::CPUPlace()) { - T beta1 = *beta1_pow->data(); - beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); - beta1_pow = &beta1_pow_tmp; - } - if (beta2_pow->place() == platform::CPUPlace()) { - T beta2 = *beta2_pow->data(); - beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); - beta2_pow = &beta2_pow_tmp; - } - - const phi::DenseTensor* beta1_tensor = nullptr; - const phi::DenseTensor* beta2_tensor = nullptr; - const phi::DenseTensor* epsilon_tensor = nullptr; - - phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32); - phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32); - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - - if (ctx.HasInput("Beta1Tensor")) { - beta1_tensor = ctx.Input("Beta1Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta1Tensor) size must be 1, but get %d", - beta1_tensor->numel())); - } else { - T beta1 = static_cast(ctx.Attr("beta1")); - beta1_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_tmp, beta1); - beta1_tensor = &beta1_tmp; - } - - if (ctx.HasInput("Beta2Tensor")) { - beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(Beta2Tensor) size must be 1, but get %d", - beta2_tensor->numel())); - } else { - T beta2 = static_cast(ctx.Attr("beta2")); - beta2_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_tmp, beta2); - beta2_tensor = &beta2_tmp; - } - - if (ctx.HasInput("EpsilonTensor")) { - epsilon_tensor = ctx.Input("EpsilonTensor"); - PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(EpsilonTensor) size must be 1, but get %d", - epsilon_tensor->numel())); - } else { - T epsilon = static_cast(ctx.Attr("epsilon")); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - } - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() - << "beta2_pow.numel() : " << beta2_pow->numel(); - VLOG(3) << "param.numel(): " << param->numel(); - - PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), - 1, - platform::errors::InvalidArgument( - "beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ApplyAdamD", - { - *param, - *mom1, - *mom2, - *beta1_pow, - *beta2_pow, - *lr, - *beta1_tensor, - *beta2_tensor, - *epsilon_tensor, - *grad, - }, - { - *param_out, - *mom1_out, - *mom2_out, - }, - {}); - runner.Run(stream); - - // NOTE(zhiqiu): ApplyAdamD updates params inplace, so - // if param and param_out is not same, we need to do copy. 
- if (param_out->data() != param->data()) { - framework::TensorCopy( - *param, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - } - if (mom1_out->data() != mom1->data()) { - framework::TensorCopy( - *mom1, - ctx.GetPlace(), - ctx.template device_context(), - mom1_out); - } - if (mom2_out->data() != mom2->data()) { - framework::TensorCopy( - *mom2, - ctx.GetPlace(), - ctx.template device_context(), - mom2_out); - } - if (!use_global_beta_pow) { - beta1_pow_out->mutable_data(ctx.GetPlace()); - beta2_pow_out->mutable_data(ctx.GetPlace()); - const auto& runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); - runner_m1.Run(stream); - const auto& runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); - runner_m2.Run(stream); - } - } -}; - -template -class AdamWNPUKernel : public AdamNPUKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - VLOG(3) << "NPU AdamW Kernel"; - bool skip_update = false; - if (ctx.HasInput("SkipUpdate")) { - VLOG(3) << "Has SkipUpdate"; - auto* skip_update_tensor = ctx.Input("SkipUpdate"); - PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "Input(SkipUpdate) size must be 1, but get %d", - skip_update_tensor->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector( - *skip_update_tensor, ctx.device_context(), &skip_update_vec); - skip_update = skip_update_vec[0]; - } - VLOG(3) << "Skip update" << skip_update; - bool with_decay = ctx.Attr("with_decay"); - if (!skip_update && with_decay) { - float coeff = ctx.Attr("coeff"); - auto* lr = ctx.Input("LearningRate"); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor one(phi::DataType::FLOAT32); - phi::DenseTensor decay(phi::DataType::FLOAT32); - phi::DenseTensor tmp(phi::DataType::FLOAT32); - - tmp.mutable_data({1}, place); - one.mutable_data({1}, place); - decay.mutable_data({1}, place); - - FillNpuTensorWithConstant(&one, 1.0f); - framework::NPUAttributeMap attr_input = {{"value", coeff}}; - - const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input); - runner1.Run(stream); - - const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {}); - runner2.Run(stream); - - if (ctx.HasInput("MasterParam")) { - PADDLE_THROW(platform::errors::Unimplemented( - "Master Parma is not supported on npu")); - } else { - auto* param_out = ctx.Output("ParamOut"); - param_out->mutable_data(ctx.GetPlace()); - - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be phi::DenseTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); - - const auto& runner = - NpuOpRunner("Mul", - {*param, decay}, - {*const_cast(param)}, - {}); - runner.Run(stream); - } - } - AdamNPUKernel::Compute(ctx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - adam, - ops::AdamNPUKernel, - ops::AdamNPUKernel); - -REGISTER_OP_NPU_KERNEL(adamw, - ops::AdamWNPUKernel, - ops::AdamWNPUKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc deleted file mode 100644 index 83c805a1f642a..0000000000000 --- 
a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMergedMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - size_t n = params.size(); - PADDLE_ENFORCE_EQ(n, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(params[i], - params_out[i], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - } - - auto grads = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ( - n, - grads.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grads.size(), - n)); - - auto velocitys = ctx.MultiInput("Velocity"); - PADDLE_ENFORCE_EQ(n, - velocitys.size(), - platform::errors::InvalidArgument( - "The size of Input(Velocity) must be equal to " - "Input(Param), but got the size of Input(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocitys.size(), - n)); - - auto velocitys_out = ctx.MultiOutput("VelocityOut"); - PADDLE_ENFORCE_EQ( - n, - velocitys_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be " - "equal to Input(Param), but got the size of Output(VelocityOut) is " - "%d, the size of Input(Param) is %d.", - velocitys_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(velocitys[i], - velocitys_out[i], - platform::errors::InvalidArgument( - "Input(Velocity) and Output(VelocityOut) must be " - "the same Tensors.")); - } - - T mu = static_cast(ctx.Attr("mu")); - auto lrs = ctx.MultiInput("LearningRate"); - if (lrs.size() != 1) { - PADDLE_ENFORCE_EQ( - n, - lrs.size(), - platform::errors::InvalidArgument( - "If the size of Input(LearningRate) is not 1, the size of " - "Input(LearningRate) must be " - "equal to Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lrs.size(), - n)); - } - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_methods = - ctx.Attr>("regularization_method"); - auto regularization_coeffs = 
- ctx.Attr>("regularization_coeff"); - if (regularization_methods.size() != 0) { - PADDLE_ENFORCE_EQ( - n, - regularization_methods.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_method) must be equal " - "to Input(Param), but got the size of " - "Attr(regularization_method) is %d, the size of Input(Param) is " - "%d.", - regularization_methods.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - regularization_coeffs.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_coeff) must be equal " - "to Input(Param), but got the size of Attr(regularization_coeff) " - "is %d, the size of Input(Param) is %d.", - regularization_coeffs.size(), - n)); - } - - VLOG(5) << "use_nesterov: " << use_nesterov - << ", regularization_methods.size(): " - << regularization_methods.size() - << ", regularization_coeffs.size(): " - << regularization_coeffs.size(); - - auto& dev_ctx = ctx.template device_context(); - - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - for (size_t idx = 0; idx < n; ++idx) { - phi::RegularizationType regularization_flag = - regularization_methods.size() > 0 && - regularization_methods[idx] == "l2_decay" - ? phi::RegularizationType::kL2DECAY - : phi::RegularizationType::kNONE; - float regularization_coeff = 0.0; - if (regularization_coeffs.size() != 0) { - regularization_coeff = regularization_coeffs[idx]; - } - - auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0]; - auto param = params[idx]; - auto param_out = params_out[idx]; - auto velocity = velocitys[idx]; - auto velocity_out = velocitys_out[idx]; - - auto grad = grads[idx]; - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(merged_momentum, - ops::NPUMergedMomentumOpKernel, - ops::NPUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op_npu.cc b/paddle/fluid/operators/optimizers/momentum_op_npu.cc deleted file mode 100644 index a5349e05b9b02..0000000000000 --- a/paddle/fluid/operators/optimizers/momentum_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/optimizers/momentum_op.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -class NPUMomentumOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - std::string regularization_method = - ctx.Attr("regularization_method"); - auto regularization_coeff = ctx.Attr("regularization_coeff"); - phi::RegularizationType regularization_flag{ - phi::RegularizationType::kNONE}; // disable regularization - if (regularization_method == "l2_decay") { - regularization_flag = phi::RegularizationType::kL2DECAY; - } - - T mu = static_cast(ctx.Attr("mu")); - bool use_nesterov = ctx.Attr("use_nesterov"); - - auto learning_rate = ctx.Input("LearningRate"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); - - auto* grad_var = ctx.InputVar("Grad"); - if (grad_var->IsType()) { - auto grad = ctx.Input("Grad"); - Tensor mu_tensor; - mu_tensor.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); - FillNpuTensorWithConstant(&mu_tensor, mu); - - Tensor regularized_grad; - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - regularized_grad.mutable_data(grad->dims(), ctx.GetPlace()); - const auto& runner1 = NpuOpRunner("Muls", - {*param}, - {regularized_grad}, - {{"value", regularization_coeff}}); - runner1.Run(dev_ctx.stream()); - const auto& runner2 = NpuOpRunner( - "Add", {regularized_grad, *grad}, {regularized_grad}, {}); - runner2.Run(dev_ctx.stream()); - } else { - regularized_grad.ShareDataWith(*grad); - } - framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out); - framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out); - // NOTE: ApplyMomentum will change the input - const auto& runner = NpuOpRunner("ApplyMomentum", - {*param_out, - *velocity_out, - *learning_rate, - regularized_grad, - mu_tensor}, - {*param_out}, - {{"use_nesterov", use_nesterov}}); - runner.Run(dev_ctx.stream()); - } else if (grad_var->IsType()) { - PADDLE_ENFORCE_EQ( - false, - true, - platform::errors::PermissionDenied("Unsupport SparseMomentum")); - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in MomentumOp. 
Excepted LodTensor " - "or SelectedRows, But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(momentum, - ops::NPUMomentumOpKernel, - ops::NPUMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc deleted file mode 100644 index 6ee01272f47e8..0000000000000 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class RMSPROPNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *grad_var = ctx.InputVar("Grad"); - auto *param_out = ctx.Output("ParamOut"); - auto *moment_out = ctx.Output("MomentOut"); - auto *mean_square_out = ctx.Output("MeanSquareOut"); - - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - auto rho = static_cast(ctx.Attr("decay")); - auto momentum = static_cast(ctx.Attr("momentum")); - auto *p_tensor = ctx.Input("Param"); - auto *ms_tensor = ctx.Input("MeanSquare"); - auto *lr_tensor = ctx.Input("LearningRate"); - auto *mom_tensor = ctx.Input("Moment"); - bool centered = ctx.Attr("centered"); - - auto stream = - ctx.template device_context() - .stream(); - if (grad_var->IsType()) { - auto *grad_tensor = ctx.Input("Grad"); - if (centered) { - framework::NPUAttributeMap attr_input = {{"use_locking", false}}; - const phi::DenseTensor *rho_tensor = nullptr; - const phi::DenseTensor *momentum_tensor = nullptr; - const phi::DenseTensor *epsilon_tensor = nullptr; - phi::DenseTensor rho_tmp(phi::DataType::FLOAT32); - rho_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&rho_tmp, rho); - rho_tensor = &rho_tmp; - phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32); - momentum_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&momentum_tmp, momentum); - momentum_tensor = &momentum_tmp; - phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32); - epsilon_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tmp, epsilon); - epsilon_tensor = &epsilon_tmp; - auto *mg_tensor = ctx.Input("MeanGrad"); - auto *mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - const auto &runner_applycenterrmsprop = NpuOpRunner( - std::string("ApplyCenteredRMSPropD"), - {*p_tensor, - *mg_tensor, - *ms_tensor, - *mom_tensor, - *lr_tensor, - *rho_tensor, - *momentum_tensor, - *epsilon_tensor, - *grad_tensor}, - {*param_out, *mean_grad_out, *mean_square_out, *moment_out}, - {attr_input}); - 
runner_applycenterrmsprop.Run(stream); - } else { - framework::NPUAttributeMap attr_input = { - {"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}}; - const auto &runner_applyrmsprop = NpuOpRunner( - std::string("ApplyRMSPropD"), - {*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor}, - {*param_out, *mean_square_out, *moment_out}, - {attr_input}); - runner_applyrmsprop.Run(stream); - } - } else { - PADDLE_ENFORCE_EQ(false, - true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Grad " - "in RmspropOp. Excepted LodTensor, " - "But received [%s]", - paddle::framework::ToTypeName(grad_var->Type()))); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - rmsprop, ops::RMSPROPNPUKernel) diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc deleted file mode 100644 index 7bd5cf8793cd0..0000000000000 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/optimizers/sgd_op.h" - -namespace paddle { -namespace operators { - -template -class SGDNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* learning_rate = ctx.Input("LearningRate"); - auto* param_var = ctx.Input("Param"); - auto* grad_var = ctx.Input("Grad"); - auto* param_out = ctx.Output("ParamOut"); - - param_out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ApplyGradientDescent", - {*param_var, *learning_rate, *grad_var}, - {*param_out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so - // if param and param_out is not same, we need to do copy. - if (param_out->data() != param_var->data()) { - framework::TensorCopy( - *param_var, - ctx.GetPlace(), - ctx.template device_context(), - param_out); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - sgd, - ops::SGDNPUKernel, - ops::SGDNPUKernel, - ops::SGDNPUKernel); diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc deleted file mode 100644 index c2d99fa42f2f8..0000000000000 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class PnormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_x = ctx.Input("X"); - auto* out_norm = ctx.Output("Out"); - out_norm->mutable_data(ctx.GetPlace()); - - float porder = ctx.Attr("porder"); - int axis = ctx.Attr("axis"); - bool keepdim = ctx.Attr("keepdim"); - - auto xdim = in_x->dims(); - if (axis < 0) axis = xdim.size() + axis; - - auto stream = - ctx.template device_context() - .stream(); - - int p = 0; - bool combine_op = - !(porder == 0 || porder == INFINITY || porder == -INFINITY); - if (porder == INFINITY) { - p = INT_MAX; - } else if (porder == -INFINITY) { - p = INT_MIN; - } else { - p = static_cast(porder); - float t = 0; - float diff = abs(std::modf(porder, &t)); - if (diff < 1e-5) { - combine_op = false; - } - } - - if (!combine_op) { - const auto& runner = NpuOpRunner("LpNorm", - {*in_x}, - {*out_norm}, - {{"p", p}, - {"axes", std::vector({axis})}, - {"keep_dims", keepdim}}); - runner.Run(stream); - } else { - phi::DenseTensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - - const auto& power_runner1 = - NpuOpRunner("Power", - {*in_x}, - {tmp_x}, - {{"power", porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner1.Run(stream); - - const auto& reduce_runner = NpuOpRunner( - "ReduceSumD", - {tmp_x}, - {*out_norm}, - {{"axes", std::vector({axis})}, {"keep_dims", keepdim}}); - reduce_runner.Run(stream); - - const auto& power_runner2 = NpuOpRunner( - "Power", - {*out_norm}, - {*out_norm}, - {{"power", 1 / porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner2.Run(stream); - } - } -}; - -template -class PnormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Out"); - auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - auto xdim = x->dims(); - float porder = ctx.Attr("porder"); - bool keepdim = ctx.Attr("keepdim"); - - int axis = ctx.Attr("axis"); - axis = axis < 0 ? 
xdim.size() + axis : axis; - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor y_share(y->type()); - phi::DenseTensor dy_share(dy->type()); - y_share.ShareDataWith(*y); - dy_share.ShareDataWith(*dy); - auto ydim = xdim; - if (!keepdim) { - ydim[axis] = 1; - } else { - ydim = y->dims(); - } - y_share.Resize(ydim); - dy_share.Resize(ydim); - - if (porder == 0) { - FillNpuTensorWithConstant(dx, static_cast(0)); - dx->Resize(xdim); - } else if (porder == INFINITY || porder == -INFINITY) { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor t_cond; - t_cond.mutable_data(xdim, place); - const auto& r_equal = - NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); - r_equal.Run(stream); - - phi::DenseTensor t_zero; - t_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_zero, static_cast(0)); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - const auto& r_mul = NpuOpRunner("Mul", {x_sign, dy_share}, {*dx}, {}); - r_mul.Run(stream); - - const auto& r_sel = - NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); - r_sel.Run(stream); - } else { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - phi::DenseTensor y_pow; - y_pow.mutable_data(ydim, place); - if (porder >= 1) { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {x_abs, y_pow}, {*dx}, {}); - r_div.Run(stream); - } else { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {y_pow, x_abs}, {*dx}, {}); - r_div.Run(stream); - } - - const auto& r_mul1 = NpuOpRunner("Mul", {*dx, x_sign}, {*dx}, {}); - r_mul1.Run(stream); - - const auto& r_mul2 = NpuOpRunner("Mul", {*dx, dy_share}, {*dx}, {}); - r_mul2.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - p_norm, - ops::PnormNPUKernel, - ops::PnormNPUKernel); - -REGISTER_OP_NPU_KERNEL( - p_norm_grad, - ops::PnormGradNPUKernel, - ops::PnormGradNPUKernel); diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc deleted file mode 100644 index 0f45d0b51c837..0000000000000 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - paddle::framework::TensorToVector( - *paddings_t, context.device_context(), &paddings); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto in_dims = x->dims(); - - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - float value = context.Attr("value"); - auto data_format = context.Attr("data_format"); - - auto* out = context.Output("Out"); - - PADDLE_ENFORCE_LT(abs(value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support constant_values=0 right now," - "but received constant_value is %f .", - value)); - - PADDLE_ENFORCE_EQ(mode, - "constant", - platform::errors::Unimplemented( - "Ascend npu only support mode=constant right now," - "but received mode is %s .", - mode)); - - std::vector paddings( - {0, 0, 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1]}); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], - in_dims[1], - in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], - in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], - in_dims[4]}); - paddings = { - 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1], 0, 0}; - } - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("PadV3") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddInput( - std::vector({0})) // npu only support constant_value=0 now - .AddOutput(*out) - .AddAttr("mode", mode); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Pad3dGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - d_in->mutable_data(context.GetPlace()); - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - - auto stream = - context.template device_context() - .stream(); - - std::vector size( - {d_in_dims[0], d_in_dims[1], d_in_dims[2], d_in_dims[3], d_in_dims[4]}); - if (mode == "constant") { // this method can be only used for constant mode - std::vector offsets({0, 0, pad_front, pad_top, pad_left}); - if (data_format == "NDHWC") { - offsets = {0, pad_front, 
pad_top, pad_left, 0}; - } - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_in}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad3d, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad3d_grad, - ops::Pad3dNPUKernel, - ops::Pad3dGradNPUKernel); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc deleted file mode 100644 index 48c2254b1ec91..0000000000000 --- a/paddle/fluid/operators/pad_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class PadNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto paddings = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - - PADDLE_ENFORCE_LT(abs(pad_value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support pad_value=0 right now," - "but received pad_value is %f .", - pad_value)); - - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("Pad") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddOutput(*out); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PadGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto paddings = context.Attr>("paddings"); - - d_x->mutable_data(context.GetPlace()); - - auto d_x_dims = d_x->dims(); - auto size = phi::vectorize(d_x_dims); - std::vector offsets(0); - int i = 0; - for (auto iter = paddings.begin(); iter < paddings.end(); ++iter, ++i) { - if (i % 2 == 0) { - offsets.push_back(*iter); - } - } - - auto stream = - context.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_x}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad, - ops::PadNPUKernel, - ops::PadNPUKernel, - ops::PadNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad_grad, - ops::PadGradNPUKernel, - ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc deleted file mode 100644 index e14c55a63642a..0000000000000 --- a/paddle/fluid/operators/pool_op_npu.cc +++ /dev/null @@ -1,334 +0,0 @@ -/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/pooling.h" - -namespace paddle { -namespace operators { - -template -class NPUPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - - bool global_pooling = ctx.Attr("global_pooling"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - - Tensor in_x_tensor, out_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive) { - std::string pooling_mode = "AdaptiveAvgPool2d"; - if (pooling_type == "max") { - pooling_mode = "AdaptiveMaxPool2d"; - } - - // AdaptiveAvgPool2d only support NCHW - Tensor transformed_input, transformed_output; - if (pooling_type == "avg" && channel_last) { - transformed_input.mutable_data( - phi::make_dim( - in_x_dims[0], in_x_dims[3], in_x_dims[1], in_x_dims[2]), - ctx.GetPlace()); - 
transformed_output.mutable_data( - phi::make_dim(out_dims[0], out_dims[3], out_dims[1], out_dims[2]), - ctx.GetPlace()); - - const auto &trans_runner = - NpuOpRunner("TransData", - {in_x_tensor}, - {transformed_input}, - {{"src_format", std::string("NHWC")}, - {"dst_format", std::string("NCHW")}}); - trans_runner.Run(dev_ctx.stream()); - } else { - transformed_input.ShareDataWith(in_x_tensor); - transformed_output.ShareDataWith(out_tensor); - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {transformed_input}, - {transformed_output}, - {{"output_size", phi::vectorize(out_data_dims)}}); - runner.Run(dev_ctx.stream()); - - if (pooling_type == "avg" && channel_last) { - const auto &trans_runner = - NpuOpRunner("TransData", - {transformed_output}, - {out_tensor}, - {{"src_format", std::string("NCHW")}, - {"dst_format", std::string("NHWC")}}); - trans_runner.Run(dev_ctx.stream()); - } - } else { - std::string pooling_mode = "AvgPoolV2"; - if (pooling_type == "max") { - PADDLE_ENFORCE_EQ( - exclusive, - true, - platform::errors::InvalidArgument( - "MaxPool only support exclusive=false, but got true")); - pooling_mode = "MaxPoolV3"; - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {in_x_tensor}, - {out_tensor}, - {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class NPUPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = - ctx.Output(framework::GradVarName("X")); - in_x_grad->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update paddings - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - Tensor in_x_tensor, out_tensor, out_grad_tensor, in_x_grad_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - out_grad_tensor.ShareDataWith(*out_grad); - in_x_grad_tensor.ShareDataWith(*in_x_grad); - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - out_grad_tensor.set_layout(DataLayout::kNHWC); - in_x_grad_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = 
phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive || (global_pooling && pooling_type == "max")) { - PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - if (channel_last) { - strides_vec[1] = data_dims[0] / out_data_dims[0]; - strides_vec[2] = data_dims[1] / out_data_dims[1]; - ksize_vec[1] = strides_vec[1]; - ksize_vec[2] = strides_vec[2]; - } else { - strides_vec[2] = data_dims[0] / out_data_dims[0]; - strides_vec[3] = data_dims[1] / out_data_dims[1]; - ksize_vec[2] = strides_vec[2]; - ksize_vec[3] = strides_vec[3]; - } - } - - NPUAttributeMap attrs = {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}; - - if (pooling_type == "max") { - if (global_pooling) { - for (auto &s : strides_vec) { - s = 1; - } - PADDLE_ENFORCE_LT(std::max(data_dims[0], data_dims[1]), - 255, - platform::errors::InvalidArgument( - "MaxPoolGrad H, W must be less than 255 when " - "global_pooling = True, but got %s", - data_dims)); - attrs["global_pooling"] = false; - } - - const auto &runner = - NpuOpRunner("MaxPoolV3Grad", - {in_x_tensor, out_tensor, out_grad_tensor}, - {in_x_grad_tensor}, - attrs); // 0: floor, 1: ceil - runner.Run(dev_ctx.stream()); - } else if (pooling_type == "avg") { - PADDLE_ENFORCE(strides[0] == strides[1], - platform::errors::InvalidArgument( - "AvgPoolGrad dose not support Asymmetric strides. 
but " - "strides = (%d, %d)", - strides[0], - strides[1])); - - NpuOpRunner runner; - runner.SetType("AvgPoolV2Grad"); - runner.AddInput(phi::vectorize(in_x->dims())); - runner.AddInput(out_grad_tensor); - runner.AddOutput(in_x_grad_tensor); - runner.AddAttrs(attrs); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(pool2d, - ops::NPUPoolOpKernel, - ops::NPUPoolOpKernel); -REGISTER_OP_NPU_KERNEL(pool2d_grad, - ops::NPUPoolGradOpKernel, - ops::NPUPoolGradOpKernel); diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc deleted file mode 100644 index fd03ce027bda5..0000000000000 --- a/paddle/fluid/operators/randperm_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/randperm_op.h" - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_NPU_KERNEL( - randperm, kernel, kernel, kernel, kernel); diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc deleted file mode 100644 index b2266608d7dca..0000000000000 --- a/paddle/fluid/operators/range_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/range_op.h" - -namespace paddle { -namespace operators { - -template -class RangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); - - phi::DenseTensor n; - framework::TensorCopy( - *start_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T start = n.data()[0]; - framework::TensorCopy( - *end_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T end = n.data()[0]; - framework::TensorCopy( - *step_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T step = n.data()[0]; - - int64_t size = 0; - GetSize(start, end, step, &size); - - out->Resize(phi::make_ddim({size})); - out->mutable_data(context.GetPlace()); - - std::vector odata; - T value = start; - for (int64_t i = 0; i < size; ++i) { - odata.push_back(value); - value += step; - } - - framework::TensorFromVector(odata, context.device_context(), out); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(range, - paddle::operators::RangeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::RangeNPUKernel, -#endif - paddle::operators::RangeNPUKernel, - paddle::operators::RangeNPUKernel) diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc deleted file mode 100644 index 068d5d6be12cd..0000000000000 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(range); -USE_OP_DEVICE_KERNEL(range, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto start = scope->Var("Start"); - auto tensor_start = start->GetMutable(); - std::vector init_start; - init_start.push_back(static_cast(1)); - paddle::framework::TensorFromVector(init_start, ctx, tensor_start); - tensor_start->Resize({1}); - - auto end = scope->Var("End"); - auto tensor_end = end->GetMutable(); - std::vector init_end; - init_end.push_back(static_cast(10)); - paddle::framework::TensorFromVector(init_end, ctx, tensor_end); - tensor_end->Resize({1}); - - auto step = scope->Var("Step"); - auto tensor_step = step->GetMutable(); - std::vector init_step; - init_step.push_back(static_cast(2)); - paddle::framework::TensorFromVector(init_step, ctx, tensor_step); - tensor_step->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - auto op = f::OpRegistry::CreateOp( - op_type, - {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}}, - {{"Out", {"Out"}}}, - {}); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); - EXPECT_EQ(static_cast(out_vec[0]), static_cast(1.0)); - EXPECT_EQ(static_cast(out_vec[1]), static_cast(3.0)); - EXPECT_EQ(static_cast(out_vec[2]), static_cast(5.0)); - EXPECT_EQ(static_cast(out_vec[3]), static_cast(7.0)); - EXPECT_EQ(static_cast(out_vec[4]), static_cast(9.0)); -} - -TEST(range, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "range"); -} diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 7c2f91999e964..27a2ff68d3aad 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -33,10 +33,3 @@ if(WITH_ROCM) SRCS check_reduce_rank_test.cu DEPS tensor) endif() - -if(WITH_ASCEND_CL) - cc_test( - reduce_any_op_npu_test - SRCS reduce_any_op_npu_test.cc - DEPS op_registry reduce_any_op scope device_context enforce executor) -endif() diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc deleted file mode 100644 index 7ec3183d412d4..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class ReduceAnyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - bool keep_dim = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - out->mutable_data(ctx.GetPlace()); - - // set attr - NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; - - const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc deleted file mode 100644 index aec1640181bcc..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(reduce_any); -USE_OP_DEVICE_KERNEL(reduce_any, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - std::vector init_x = {true, false, false, false}; - f::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - std::vector axes; - f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}}; - auto op = f::OpRegistry::CreateOp( - "reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - - ctx.Wait(); - - std::vector out_vec; - f::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - std::vector expected_vec = {true}; - EXPECT_EQ(out_vec.size(), expected_vec.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_vec[i]); - } -} - -TEST(reduce_any, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc deleted file mode 100644 index de4049c7e7f97..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ /dev/null @@ -1,216 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - bool reduce_all = ctx.Attr("reduce_all"); - int out_dtype = ctx.Attr("out_dtype"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor cast_out(x->type()); - cast_out.Resize(out->dims()); - cast_out.mutable_data(place); - - auto cast_out_dtype = framework::TransToProtoVarType(x->dtype()); - if (out_dtype != -1) { - cast_out_dtype = static_cast(out_dtype); - } - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - if (cast_out_dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - } else { - out->ShareDataWith(cast_out); - } - - framework::NPUAttributeMap attr_input = {{"axes", dims}, - {"keep_dims", keep_dim}}; - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; - } - - const auto& dev_ctx = - ctx.template device_context(); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT64) { - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = - NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs); - runner.Run(dev_ctx.stream()); - }; - - NpuOpRunner::TypeAdapter({*x}, - {cast_out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = - NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input); - runner.Run(dev_ctx.stream()); - } - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(dev_ctx.stream()); - } - } -}; - -template -class ReduceMaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto reduce_dims = context.Attr>("dim"); - bool reduce_all = context.Attr("reduce_all"); - int in_dtype = context.Attr("in_dtype"); - - PADDLE_ENFORCE_EQ( - in_dtype == -1, - true, - platform::errors::InvalidArgument( - "NPU only support in_dtype == -1 in reduce_max_grad op.")); - - auto* x_grad = - context.Output(framework::GradVarName("X")); - x_grad->mutable_data(context.GetPlace()); - - auto& dev_ctx = - context.template 
device_context(); - auto place = context.GetPlace(); - auto stream = dev_ctx.stream(); - - // broadcast - auto x_dims_vec = phi::vectorize(x->dims()); - if (reduce_all) { - reduce_dims.clear(); - for (size_t d = 0; d < x_dims_vec.size(); ++d) { - reduce_dims.push_back(static_cast(d)); - } - } - - phi::DenseTensor tmp_out, tmp_out_grad; - auto tmp_out_dims_vec = x_dims_vec; - for (auto d : reduce_dims) { - if (d < 0) { - d += x_dims_vec.size(); - } - tmp_out_dims_vec[d] = 1; - } - - tmp_out.ShareDataWith(*out); - tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec)); - tmp_out_grad.ShareDataWith(*out_grad); - tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); - - phi::DenseTensor transformed_out(x->type()); - transformed_out.Resize(phi::make_ddim(x_dims_vec)); - transformed_out.mutable_data(place); - NpuOpRunner r_brd_out; - r_brd_out.SetType("BroadcastTo") - .AddInput(tmp_out) - .AddInput(std::move(x_dims_vec)) - .AddOutput(transformed_out) - .Run(stream); - phi::DenseTensor transformed_out_grad(x->type()); - transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); - transformed_out_grad.mutable_data(place); - NpuOpRunner r_brd_out_grad; - r_brd_out_grad.SetType("BroadcastTo") - .AddInput(tmp_out_grad) - .AddInput(std::move(x_dims_vec)) - .AddOutput(transformed_out_grad) - .Run(stream); - - // compare - phi::DenseTensor equal_cond; - equal_cond.mutable_data(x_grad->dims(), place); - const auto& r_equal = - NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {}); - r_equal.Run(stream); - - // select - phi::DenseTensor t_zero; - t_zero.mutable_data(x_grad->dims(), place); - FillNpuTensorWithConstant(&t_zero, static_cast(0)); - t_zero.Resize(x_grad->dims()); - - const auto& r_sel = NpuOpRunner( - "SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {}); - r_sel.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - reduce_max, - ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel, - ops::ReduceMaxNPUKernel); -REGISTER_OP_NPU_KERNEL( - reduce_max_grad, - ops::ReduceMaxGradNPUKernel, - ops::ReduceMaxGradNPUKernel, - ops::ReduceMaxGradNPUKernel, - ops::ReduceMaxGradNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc deleted file mode 100644 index 65fabbd21cb7e..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
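The grad kernel deleted above routes gradient only to the max positions: it broadcasts Out and dOut back to X's shape, marks positions where X equals the broadcast max (Equal), and selects dOut there and zero elsewhere (SelectV2). A scalar sketch of the same rule for the reduce_all case (plain C++, assuming the exact-equality semantics of the NPU Equal op):

#include <vector>

// dX[i] = dOut if X[i] equals the max, else 0; note that ties all receive
// dOut, matching the Equal + SelectV2 sequence above.
std::vector<float> ReduceMaxGradAll(const std::vector<float>& x,
                                    float out_max, float dout) {
  std::vector<float> dx(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == out_max) dx[i] = dout;
  }
  return dx;
}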
*/ -#include "paddle/fluid/operators/elementwise/elementwise_npu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" - -namespace paddle { -namespace operators { - -template -class NPUReduceMeanOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - - bool reduce_all = ctx.Attr("reduce_all"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - - auto input_dims = input->dims(); - if (reduce_all) { - dims.clear(); - for (int i = 0; i < input_dims.size(); i++) { - dims.push_back(static_cast(i)); - } - } - - auto stream = - ctx.template device_context() - .stream(); - - NpuOpRunner runner; - runner.SetType("ReduceMean") - .AddInput(*input) - .AddInput(std::move(dims)) - .AddOutput(*output) - .AddAttrs({{"keep_dims", keep_dim}}) - .Run(stream); - } -}; - -template -class NPUReduceMeanGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - - bool reduce_all = ctx.Attr("reduce_all"); - auto reduce_dims = ctx.Attr>("dim"); - auto input_dims = input->dims(); - - int reduce_numel = 1; - if (reduce_all) { - reduce_dims.clear(); - for (int d = 0; d < input_dims.size(); ++d) { - reduce_dims.push_back(static_cast(d)); - } - } - for (auto& d : reduce_dims) { - if (d < 0) { - d = d + input_dims.size(); - } - reduce_numel *= input_dims[d]; - } - - phi::DenseTensor tensor_value(input_grad->dtype()); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant( - &tensor_value, static_cast(1.0f / static_cast(reduce_numel))); - - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(input_dims)) - .AddInput(tensor_value) - .AddOutput(*input_grad) - .Run(stream); - - phi::DenseTensor transformed_input_grad, transformed_out_grad; - phi::DenseTensor tmp_output_grad; - auto tmp_output_dims = input_dims; - for (auto d : reduce_dims) { - tmp_output_dims[d] = 1; - } - tmp_output_grad.ShareDataWith(*output_grad); - tmp_output_grad.Resize(tmp_output_dims); - auto& dev_ctx = - ctx.template device_context(); - NpuElementWiseOpBroadcast(dev_ctx, - input_grad, - &tmp_output_grad, - 0, - &transformed_input_grad, - &transformed_out_grad); - const auto& runner2 = - NpuOpRunner("Mul", - {transformed_input_grad, transformed_out_grad}, - {*input_grad}, - {}); - runner2.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(reduce_mean, ops::NPUReduceMeanOpKernel); -REGISTER_OP_NPU_KERNEL(reduce_mean_grad, ops::NPUReduceMeanGradOpKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc deleted file mode 100644 index e4adc42283120..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
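The mean-grad kernel deleted above reduces to two device ops: Fill dX with 1 / reduce_numel (the product of the reduced extents), then Mul by dOut broadcast over the reduced dims. For the reduce_all case that collapses to the following sketch (ReduceMeanGradAll is a hypothetical name):

#include <vector>

// Every input element receives dout / n: the "Fill" step, then the "Mul" step.
std::vector<float> ReduceMeanGradAll(size_t n, float dout) {
  std::vector<float> dx(n, 1.0f / static_cast<float>(n));
  for (float& v : dx) v *= dout;
  return dx;
}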
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceMinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - bool reduce_all = ctx.Attr("reduce_all"); - int out_dtype = ctx.Attr("out_dtype"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor cast_out(x->type()); - cast_out.Resize(out->dims()); - cast_out.mutable_data(place); - - auto cast_out_dtype = framework::TransToProtoVarType(x->dtype()); - if (out_dtype != -1) { - cast_out_dtype = static_cast(out_dtype); - } - - if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) { - if (cast_out_dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - } else { - out->ShareDataWith(cast_out); - } - - framework::NPUAttributeMap attr_input = {{"axes", dims}, - {"keep_dims", keep_dim}}; - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; - } - - const auto& dev_ctx = - ctx.template device_context(); - if (x->dtype() == phi::DataType::INT64) { - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = - NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs); - runner.Run(dev_ctx.stream()); - }; - - NpuOpRunner::TypeAdapter({*x}, - {cast_out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = - NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input); - runner.Run(dev_ctx.stream()); - } - - if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) { - auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - reduce_min, - ops::ReduceMinNPUKernel, - ops::ReduceMinNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceMinNPUKernel, -#endif - ops::ReduceMinNPUKernel); diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc deleted file mode 100644 index fd9bf28b60793..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceProdNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto dims = ctx.Attr>("dim"); - bool keep_dim = ctx.Attr("keep_dim"); - bool reduce_all = ctx.Attr("reduce_all"); - int out_dtype = ctx.Attr("out_dtype"); - - auto place = ctx.GetPlace(); - - phi::DenseTensor cast_out(x->type()); - cast_out.Resize(out->dims()); - cast_out.mutable_data(place); - - auto cast_out_dtype = framework::TransToProtoVarType(x->dtype()); - if (out_dtype != -1) { - cast_out_dtype = static_cast(out_dtype); - } - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - if (cast_out_dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (cast_out_dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - } else { - out->ShareDataWith(cast_out); - } - - framework::NPUAttributeMap attr_input = {{"axes", dims}, - {"keep_dims", keep_dim}}; - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; - } - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input); - runner.Run(stream); - - if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) { - auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - reduce_prod, - ops::ReduceProdNPUKernel, - ops::ReduceProdNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc deleted file mode 100644 index 0c6665494ece7..0000000000000 --- 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/unsqueeze_op.h" - -namespace paddle { -namespace operators { - -template -class ReduceSumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - bool reduce_all = ctx.Attr("reduce_all"); - bool keep_dims = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - out->mutable_data(ctx.GetPlace()); - - // special case - if (x->dims().size() == 1 && keep_dims == false) { - keep_dims = true; - } - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor cast_x; - phi::DenseTensor cast_out; - // NOTE: ReduceSumD only supports fp32 and fp16 - if (framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP32 && - framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP16) { - cast_x.Resize(x->dims()); - cast_x.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); - const auto& runner_cast = NpuOpRunner( - "Cast", {*x}, {cast_x}, {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - - cast_out.Resize(out->dims()); - cast_out.mutable_data(ctx.GetPlace()); - } else { - cast_x.ShareDataWith(*x); - cast_out.ShareDataWith(*out); - } - - if (reduce_all) { - std::vector dim_vec; - for (int i = 0; i < x->dims().size(); i++) { - dim_vec.push_back(i); - } - - const auto& runner = - NpuOpRunner("ReduceSumD", - {cast_x}, - {cast_out}, - {{"axes", dim_vec}, {"keep_dims", keep_dims}}); - runner.Run(stream); - - } else { - const auto& runner = - NpuOpRunner("ReduceSumD", - {cast_x}, - {cast_out}, - {{"axes", dims}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - - if (framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP32 && - framework::TransToProtoVarType(x->dtype()) != - framework::proto::VarType::FP16) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype())); - const auto& runner_cast = - NpuOpRunner("Cast", - {cast_out}, - {*out}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - } - } -}; - -template -class ReduceSumGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - bool reduce_all = ctx.Attr("reduce_all"); - bool keep_dims = ctx.Attr("keep_dim"); - auto dims = ctx.Attr>("dim"); - - x_grad->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - if (keep_dims || reduce_all) { - const auto& runner = 
NpuOpRunner("BroadcastToD", - {*out_grad}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } else { - framework::DDim out_dims; - out_dims = UnsqueezeKernel::GetOutputShape( - dims, out_grad->dims()); - - phi::DenseTensor out_grad_tmp(out_grad->type()); - out_grad_tmp.Resize(out_dims); - out_grad_tmp.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *out_grad, - ctx.GetPlace(), - ctx.template device_context(), - &out_grad_tmp); - out_grad_tmp.Resize(out_dims); - - const auto& runner = NpuOpRunner("BroadcastToD", - {out_grad_tmp}, - {*x_grad}, - {{"shape", phi::vectorize(x->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reduce_sum, - ops::ReduceSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumNPUKernel, -#endif - ops::ReduceSumNPUKernel, - ops::ReduceSumNPUKernel); -REGISTER_OP_NPU_KERNEL( - reduce_sum_grad, - ops::ReduceSumGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ReduceSumGradNPUKernel, -#endif - ops::ReduceSumGradNPUKernel, - ops::ReduceSumGradNPUKernel); diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc deleted file mode 100644 index 2d4497a19e77b..0000000000000 --- a/paddle/fluid/operators/reshape_op_npu.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/tensor_utils.h" - -namespace paddle { -namespace operators { - -template -class Reshape2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto stream = - ctx.template device_context() - .stream(); - auto place = ctx.GetPlace(); - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector target_shape_vector; - auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); - if (shape_tensor_vector.size() > 0) { - for (auto* shape_tensor : shape_tensor_vector) { - PADDLE_ENFORCE_EQ( - shape_tensor->dims().size(), - 1, - platform::errors::InvalidArgument( - "If the element type of 'shape' in Reshape Op is Tensor, " - "the element's shape must be [1]. But received the element's " - "shape is [%d]", - shape_tensor->dims().size())); - - target_shape_vector.push_back( - phi::GetVectorFromTensor(shape_tensor)[0]); - } - } else { - auto* shape_tensor = ctx.HasInput("Shape") - ? 
ctx.Input("Shape") - : nullptr; - if (shape_tensor) { - target_shape_vector = phi::GetVectorFromTensor(shape_tensor); - } else { - target_shape_vector = ctx.Attr>("shape"); - PADDLE_ENFORCE_GT( - target_shape_vector.size(), - 0, - platform::errors::InvalidArgument( - "The length of shape attribute should be larger than 0 when " - "input ShapeTensor and Shape are empty!")); - } - } - - int num_negative = - std::count(target_shape_vector.begin(), target_shape_vector.end(), -1); - PADDLE_ENFORCE_LE( - num_negative, - 1, - platform::errors::InvalidArgument( - "The max number of -1 in shape attribute or shape tensor is 1 " - "but received %d.", - num_negative)); - auto it_zero = - std::find(target_shape_vector.begin(), target_shape_vector.end(), 0); - if (it_zero != target_shape_vector.end()) { - int x_rank = x->dims().size(); - for (size_t i = 0; i < target_shape_vector.size(); i++) { - if (target_shape_vector[i] == 0) { - PADDLE_ENFORCE_LT( - i, - x_rank, - platform::errors::InvalidArgument( - "The index of 0 in shape attribute or shape tensor", - "should be less than input dim size, ", - "but the index is %d and input dim size is %d", - i, - x_rank)); - target_shape_vector[i] = x->dims().at(i); - } - } - } - - auto it = - std::find(target_shape_vector.begin(), target_shape_vector.end(), -1); - if (it != target_shape_vector.end()) { - auto ddim_out_vec = phi::vectorize(x->dims()); - int ddim_out_product = std::accumulate( - ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies()); - int reshape_out_product = std::accumulate(target_shape_vector.begin(), - target_shape_vector.end(), - -1, - std::multiplies()); - int index = std::distance(target_shape_vector.begin(), it); - target_shape_vector[index] = ddim_out_product / reshape_out_product; - } - - auto out_dims = phi::make_ddim(target_shape_vector); - out->mutable_data(out_dims, place); - - NpuOpRunner runner; - // the shape input must be on the host side - runner.SetType("Reshape") - .AddInput(*x) - .AddInput(std::vector(target_shape_vector)) - .AddOutput(*out) - .AddAttr("axis", 0) - .AddAttr("num_axes", -1); - runner.Run(stream); - } -}; - -template -class Reshape2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto in_dims = d_x->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(in_dims); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - reshape2, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel, - ops::Reshape2NPUKernel); -REGISTER_OP_NPU_KERNEL( - reshape2_grad, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel, - ops::Reshape2GradNPUKernel); diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc deleted file mode 100644 index 7d15dc2a46558..0000000000000 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class ROIAlignNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); // (B,C,H,W) - auto* ROIs = ctx.Input("ROIs"); // (N,4) - auto* ROIsNum = ctx.Input("RoisNum"); // [0 1 1 2 2 2] - auto* Out = ctx.Output("Out"); - Out->mutable_data(ctx.GetPlace()); - - auto spatial_scale = ctx.Attr("spatial_scale"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto sample_num = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - auto roi_end_mode = 0; - PADDLE_ENFORCE_EQ( - aligned, - false, - platform::errors::InvalidArgument( - "ROIAlignNPU only support Aligned attribute equaled to False")); - - framework::NPUAttributeMap attr_roi = {{"spatial_scale", spatial_scale}, - {"pooled_height", pooled_height}, - {"pooled_width", pooled_width}, - {"sample_num", sample_num}, - {"roi_end_mode", roi_end_mode}}; - - auto stream = - ctx.template device_context() - .stream(); - - // Combine *ROIsNum with ROIs to get new ROIs - // change roisnum's datatype & resize - int dtype = - static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); - framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}}; - phi::DenseTensor ROIsNum_fp(ROIs->dtype()); - ROIsNum_fp.Resize(phi::make_ddim({ROIs->dims()[0], 1})); - ROIsNum_fp.mutable_data(ctx.GetPlace()); - - const auto& runner_c = - NpuOpRunner("Cast", {*ROIsNum}, {ROIsNum_fp}, attr_cast); - runner_c.Run(stream); - - // concate to make (N, 5) - std::vector x_list; - x_list.push_back(ROIsNum_fp); - x_list.push_back(*ROIs); - auto axis = 1; - // output of concate - phi::DenseTensor ROIs_N5(ROIs->dtype()); - ROIs_N5.Resize(phi::make_ddim({ROIs->dims()[0], 5})); - ROIs_N5.mutable_data(ctx.GetPlace()); - - // attribute of concate - auto EleNum = 2; - framework::NPUAttributeMap attr_concat = {{"N", EleNum}, - {"concat_dim", axis}}; - - NpuOpRunner runner0; - runner0.SetType("ConcatD") - .AddInputs(x_list) - .AddOutput(ROIs_N5) - .AddInputNames({"x0", "x1"}) - .AddAttrs(attr_concat); - runner0.Run(stream); - - const auto& runner = - NpuOpRunner("ROIAlign", {*X, ROIs_N5}, {*Out}, attr_roi); - runner.Run(stream); - } -}; - -template -class ROIAlignNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sample_num = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int rois_num = 
rois->dims()[0]; - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - if (!in_grad) { - return; - } - in_grad->mutable_data(place); - - PADDLE_ENFORCE_EQ( - aligned, - false, - platform::errors::InvalidArgument( - "ROIAlignGradNPU only support Aligned attribute equaled to False")); - PADDLE_ENFORCE_EQ( - ctx.HasInput("RoisNum"), - true, - platform::errors::NotFound("Input(RoisNum) of ROIAlignGradOp " - "is not found while using NPU.")); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rois->dtype()), - framework::proto::VarType::FP32, - platform::errors::InvalidArgument( - "ROIAlignGradNPU only support ROIs type equaled to FP32.")); - - // Cast RoisNum to fp32 tensor - auto* RoisNum = ctx.Input("RoisNum"); - phi::DenseTensor ROIs_N5; - ROIs_N5.mutable_data({rois_num, 5}, place); - phi::DenseTensor ROIsNum_fp; - ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] - int nputype_fp32 = - static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); - const auto& runner_cast = NpuOpRunner( - "Cast", {*RoisNum}, {ROIsNum_fp}, {{"dst_type", nputype_fp32}}); - runner_cast.Run(stream); - ROIsNum_fp.Resize({rois_num, 1}); - - // Combine *ROIsNum with ROIs to get new ROIs - std::vector x_list; - x_list.push_back(ROIsNum_fp); - x_list.push_back(*rois); - const auto& runner_concat = NpuOpRunner( - "ConcatD", {x_list}, {ROIs_N5}, {{"N", 2}, {"concat_dim", 1}}); - runner_concat.Run(stream); - - // If CANN version code is less than 504, by analysis, in order to match - // cpu grad version, rois[:,3:5] should substrate 1 before call ascend grad - // function -#if (CANN_VERSION_CODE < 504000) - std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; - phi::DenseTensor tsr_dlt; - tsr_dlt.mutable_data({5}, place); - framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); - ctx.template device_context().Wait(); - const auto& runner_add = - NpuOpRunner("AddV2", {ROIs_N5, tsr_dlt}, {ROIs_N5}, {}); - runner_add.Run(stream); -#endif - - // Call ascend RoiAlignGrad function - int roi_end_mode = 0; - const auto& runner_roi_align_grad = - NpuOpRunner("ROIAlignGrad", - {*out_grad, ROIs_N5}, - {*in_grad}, - {{"xdiff_shape", phi::vectorize(in_dims)}, - {"pooled_width", pooled_width}, - {"pooled_height", pooled_height}, - {"spatial_scale", spatial_scale}, - {"sample_num", sample_num}, - {"roi_end_mode", roi_end_mode}}); - runner_roi_align_grad.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - roi_align, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel); - -REGISTER_OP_NPU_KERNEL(roi_align_grad, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel); diff --git a/paddle/fluid/operators/run_program_op_npu.cc b/paddle/fluid/operators/run_program_op_npu.cc deleted file mode 100644 index e45ce0a2bef9f..0000000000000 --- a/paddle/fluid/operators/run_program_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
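Ascend's ROIAlign takes rois as (N, 5) rows with a leading batch index, which is why both the forward and grad kernels above cast RoisNum (per-ROI batch ids, e.g. [0 1 1 2 2 2]) to fp32 and ConcatD it in front of the (N, 4) boxes. A host-side sketch of the layout being built (PrependBatchIndex is a hypothetical helper for clarity):

#include <array>
#include <vector>

// [batch_idx, x1, y1, x2, y2] per row, batch index prepended as float.
std::vector<std::array<float, 5>> PrependBatchIndex(
    const std::vector<std::array<float, 4>>& rois,
    const std::vector<int>& batch_ids) {
  std::vector<std::array<float, 5>> rois_n5(rois.size());
  for (size_t i = 0; i < rois.size(); ++i) {
    rois_n5[i] = {static_cast<float>(batch_ids[i]),
                  rois[i][0], rois[i][1], rois[i][2], rois[i][3]};
  }
  return rois_n5;
}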
-See the License for the specific language governing permissions and
-limitations under the License. */
diff --git a/paddle/fluid/operators/sampling_id_op_npu.cc b/paddle/fluid/operators/sampling_id_op_npu.cc
deleted file mode 100644
index 5657edcfa35bb..0000000000000
--- a/paddle/fluid/operators/sampling_id_op_npu.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/sampling_id_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(sampling_id,
-                       paddle::operators::SamplingIdKernel<float>,
-                       paddle::operators::SamplingIdKernel<double>);
diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc
deleted file mode 100644
index 1fb136a5110db..0000000000000
--- a/paddle/fluid/operators/save_combine_op_npu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/save_combine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    save_combine,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc
deleted file mode 100644
index d6063d66f1531..0000000000000
--- a/paddle/fluid/operators/save_op_npu.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/operators/save_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc deleted file mode 100644 index c25a49c4f3b60..0000000000000 --- a/paddle/fluid/operators/scale_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -static inline T GetAttrFromTensor(const phi::DenseTensor* tensor) { - const auto* tensor_data = tensor->data(); - phi::DenseTensor cpu_tensor; - if (platform::is_gpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - paddle::framework::TensorCopySync( - *tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -template -class ScaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto scale = ctx.Attr("scale"); - auto bias = ctx.Attr("bias"); - auto bias_after_scale = ctx.Attr("bias_after_scale"); - auto stream = - ctx.template device_context() - .stream(); - float power = 1.0; - VLOG(4) << "scale:" << scale << ", bias:" << bias - << " ,bias_after_scale:" << bias_after_scale; - if (ctx.HasInput("ScaleTensor")) { - auto* scale_tensor = ctx.Input("ScaleTensor"); - scale = static_cast(GetAttrFromTensor(scale_tensor)); - } - if (isinf(scale)) { - if (signbit(scale)) { - scale = -std::numeric_limits::max(); - } else { - scale = std::numeric_limits::max(); - } - } - if (!bias_after_scale) { - bias *= scale; - } - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attrs = { - {"power", power}, {"scale", scale}, {"shift", bias}}; - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& muls_runner = NpuOpRunner( - "Muls", {inputs[0]}, {outputs[0]}, {{"value", attrs.at("scale")}}); - muls_runner.Run(dev_ctx.stream()); - - const auto& adds_runner = NpuOpRunner( - "Adds", {outputs[0]}, {outputs[0]}, {{"value", attrs.at("shift")}}); - adds_runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - 
op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("Power", {*x}, {*out}, attrs); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - scale, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc deleted file mode 100644 index b2b09faaa9d44..0000000000000 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/seed_op_npu.cc b/paddle/fluid/operators/seed_op_npu.cc deleted file mode 100644 index 1843e993d552a..0000000000000 --- a/paddle/fluid/operators/seed_op_npu.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/seed_op.h" - -namespace paddle { -namespace operators { - -template -class NPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Output("Out"); - int user_seed = ctx.Attr("seed"); - std::random_device rnd; - int seed; - - if (user_seed != 0) { - seed = user_seed; - } else { - seed = rnd(); - } - - out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(out, seed); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - seed, ops::NPUSeedKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc deleted file mode 100644 index 3978923d46af7..0000000000000 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
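The scale kernel removed above lowers to Ascend's Power op with power fixed at 1, i.e. out = scale * x + shift; when bias_after_scale is false it first folds the bias into the scale, which is what the kernel's `bias *= scale` line implements. The arithmetic, as a one-function reference (ScaleRef is a hypothetical name):

// bias_after_scale == true :  out = scale * x + bias
// bias_after_scale == false:  out = scale * (x + bias)
//                                 = scale * x + (scale * bias)
float ScaleRef(float x, float scale, float bias, bool bias_after_scale) {
  if (!bias_after_scale) bias *= scale;
  return scale * x + bias;  // Power(power = 1, scale, shift = bias)
}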
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" - -namespace paddle { -namespace operators { - -template -class SequenceMaskNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Output("Y"); - int maxlen = ctx.Attr("maxlen"); - - if (ctx.HasInput("MaxLenTensor")) { - auto max_len_tensor = ctx.Input("MaxLenTensor"); - PADDLE_ENFORCE_NOT_NULL(max_len_tensor, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) should not be NULL." - "But received Input(MaxLenTensor) is NULL")); - phi::DenseTensor temp; - paddle::framework::TensorCopySync( - *max_len_tensor, platform::CPUPlace(), &temp); - maxlen = *temp.data(); - PADDLE_ENFORCE_GT( - maxlen, - 0, - platform::errors::InvalidArgument( - "Input(MaxLenTensor) value should be greater than 0. But " - "received Input(MaxLenTensor) value = %d.", - maxlen)); - } - - if (maxlen < 0) { - auto x_numel = x->numel(); - if (x_numel == 0) { - maxlen = 0; - } else { - std::vector x_vec; - framework::TensorToVector(*x, dev_ctx, &x_vec); - auto x_data = x_vec.data(); - maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); - } - } - auto y_dim = phi::vectorize(x->dims()); - y_dim.push_back(maxlen); - - phi::DenseTensor cast_x; - cast_x.mutable_data(x->dims(), ctx.GetPlace()); - const auto& cast1_runner = NpuOpRunner( - "Cast", - {*x}, - {cast_x}, - {{"dst_type", - ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}}); - cast1_runner.Run(dev_ctx.stream()); - - phi::DenseTensor tmp; - tmp.mutable_data(phi::make_ddim({maxlen}), ctx.GetPlace()); - NpuOpRunner range_runner; - range_runner.SetType("Range"); - range_runner.AddInput(std::vector({0})); - range_runner.AddInput(std::vector({maxlen})); - range_runner.AddInput(std::vector({1})); - range_runner.AddOutput(tmp); - range_runner.Run(dev_ctx.stream()); - - phi::DenseTensor expand_tmp; - expand_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& expand_runner = - NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}}); - expand_runner.Run(dev_ctx.stream()); - - auto x_dims = phi::vectorize(x->dims()); - x_dims.push_back(1); - cast_x.Resize(phi::make_ddim({x_dims})); - phi::DenseTensor x_tmp; - x_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& tile_runner = - NpuOpRunner("TileWithAxis", - {cast_x}, - {x_tmp}, - {{"axis", x->dims().size()}, {"tiles", maxlen}}); - tile_runner.Run(dev_ctx.stream()); - - phi::DenseTensor y_tmp; - y_tmp.mutable_data(phi::make_ddim(y_dim), ctx.GetPlace()); - const auto& less_runner = - NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {}); - less_runner.Run(dev_ctx.stream()); - - y->Resize(phi::make_ddim(y_dim)); - auto out_dtype = static_cast( - ctx.Attr("out_dtype")); - if (out_dtype == framework::proto::VarType::INT32) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::INT64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP32) { - 
y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::FP64) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::BOOL) { - y->mutable_data(ctx.GetPlace()); - } else if (out_dtype == framework::proto::VarType::UINT8) { - y->mutable_data(ctx.GetPlace()); - } else { - PADDLE_ENFORCE(false, - platform::errors::InvalidArgument( - "out_dtype only supporing int32, int64, fp32, fp64, " - "bool, uint8, but receive out_dtype is %d", - out_dtype)); - } - - const auto& cast2_runner = NpuOpRunner( - "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}}); - cast2_runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - sequence_mask, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel, - ops::SequenceMaskNPUKernel); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc deleted file mode 100644 index b572e98eb81e9..0000000000000 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
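The sequence_mask kernel above composes Range, ExpandD, TileWithAxis and Less to compute y[i][j] = (j < x[i]) for j in [0, maxlen), falling back to maxlen = max(x) when the attribute is negative. Reference semantics as a sketch (SequenceMaskRef is a hypothetical name):

#include <algorithm>
#include <vector>

// y[i][j] = 1 iff j < lengths[i]; maxlen < 0 means "use the longest sequence".
std::vector<std::vector<int>> SequenceMaskRef(const std::vector<int>& lengths,
                                              int maxlen) {
  if (maxlen < 0)
    maxlen = lengths.empty()
                 ? 0
                 : *std::max_element(lengths.begin(), lengths.end());
  std::vector<std::vector<int>> y(lengths.size(),
                                  std::vector<int>(maxlen, 0));
  for (size_t i = 0; i < lengths.size(); ++i)
    for (int j = 0; j < maxlen && j < lengths[i]; ++j) y[i][j] = 1;
  return y;
}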
*/ - -#include "paddle/fluid/operators/set_value_op.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class SetValueNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = - phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = - phi::funcs::GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - paddle::framework::TensorCopy(*in, ctx.GetPlace(), out); - - auto starts_indices = std::vector(in_dims.size(), 0); - auto ends_indices = std::vector(in_dims.size(), 0); - auto strides_indices = std::vector(in_dims.size(), 0); - - for (int i = 0; i < in_dims.size(); ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - } - - int64_t stride_step = phi::product(in_dims); - std::vector index_indices(1, 0); - for (size_t i = 0; i < strides_indices.size(); ++i) { - auto index_size = index_indices.size(); - stride_step /= in_dims[i]; - for (size_t j = 0; j < index_size; ++j) { - auto start_index = *index_indices.begin(); - if (strides_indices[i] > 0) { - for (int64_t k = starts_indices[i]; k < ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } else { - for (int64_t k = starts_indices[i]; k > ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } - index_indices.erase(index_indices.begin()); - } - } - - PADDLE_ENFORCE_EQ( - static_cast(index_indices.size()), - 
phi::product(slice_dims_for_assign), - platform::errors::InvalidArgument( - "OP(set_value) error: index indices and the value to update do not match.")); - - phi::DenseTensor value_t(in->type()); - if (value_tensor != nullptr) { - value_t.ShareDataWith(*value_tensor); - } else { - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, ctx.GetPlace()); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVectorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - } - - auto stream = ctx.template device_context().stream(); - - phi::DenseTensor value_temp(in->type()); - if (slice_dims_for_assign == value_t.dims()) { - value_temp.ShareDataWith(value_t); - } else { - value_temp.Resize(slice_dims_for_assign); - value_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(value_t) - .AddInput(phi::vectorize(slice_dims_for_assign)) - .AddOutput(value_temp) - .Run(stream); - } - - int64_t input_numel = phi::product(in_dims); - int64_t index_numel = index_indices.size(); - - phi::DenseTensor in_temp, out_temp, val_temp; - in_temp.ShareDataWith(*in); - out_temp.ShareDataWith(*out); - val_temp.ShareDataWith(value_temp); - in_temp.Resize(phi::make_ddim({input_numel})); - out_temp.Resize(phi::make_ddim({input_numel})); - val_temp.Resize(phi::make_ddim({index_numel})); - - NpuOpRunner runner; - runner.SetType("ScatterUpdate") - .AddInput(in_temp) - .AddInput(std::move(index_indices)) - .AddInput(val_temp) - .AddOutput(out_temp) -#if (CANN_VERSION_CODE >= 504000) - .AddAttrs({{"use_locking", false}}) -#endif - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(set_value, - ops::SetValueNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SetValueNPUKernel, -#endif - ops::SetValueNPUKernel) diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc deleted file mode 100644 index 76f4539e70b2f..0000000000000 --- a/paddle/fluid/operators/shape_op_npu.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShapeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - out_t->Resize({x->dims().size()}); - out_t->mutable_data(ctx.GetPlace()); - - // The output data type defaults to int32.
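(For reference: the ShapeNPUKernel being deleted here always emits the input's dims as int32, regardless of the input dtype; see the "dtype" attr passed to the Shape runner just below. A minimal host-side sketch of that contract follows; the helper name is hypothetical, not a Paddle API.)

#include <cstdint>
#include <vector>

// What the NPU Shape op computes here: the input's dims as a 1-D int32
// tensor. Any dim >= 2^31 would silently narrow.
std::vector<int32_t> ShapeOpReference(const std::vector<int64_t>& dims) {
  std::vector<int32_t> out(dims.size());
  for (size_t i = 0; i < dims.size(); ++i) {
    out[i] = static_cast<int32_t>(dims[i]);
  }
  return out;
}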
- auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - runner.SetType("Shape").AddInput(*x).AddOutput(*out_t).AddAttr( - "dtype", static_cast(dst_dtype)); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - shape, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel); diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc deleted file mode 100644 index 4181db1d8e04c..0000000000000 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShardIndexNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "start kernel"; - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - - PADDLE_ENFORCE_GT( - index_num, - 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, - 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, - 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, - nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, - shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - auto place = context.GetPlace(); - out->Resize(in->dims()); - out->set_lod(in->lod()); - out->mutable_data(place); - - phi::DenseTensor tmp(in->type()); - tmp.mutable_data(framework::DDim({1}), place); - FillNpuTensorWithConstant(&tmp, shard_size); - - phi::DenseTensor condition(phi::DataType::BOOL); - condition.mutable_data(in->dims(), place); - - phi::DenseTensor tmp2(in->type()); - tmp2.mutable_data(in->dims(), place); - - phi::DenseTensor tmp3(in->type()); - tmp3.mutable_data(in->dims(), place); - - auto stream = - context.template device_context() - .stream(); - - NpuOpRunner runner; - runner.AddInputs({*in, tmp}); - runner.AddOutputs({tmp2}); - runner.SetType("Mod"); - 
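(The Mod runner prepared above, together with the FloorDiv, Equal, and Select runners that follow below, computes shard_index element-wise. A scalar sketch of the composed formula, assuming shard_size = (index_num + nshards - 1) / nshards as computed earlier in the kernel; the helper name is hypothetical.)

#include <cstdint>

// An index v belongs to shard v / shard_size. If that shard is this
// worker's shard_id, emit the local offset v % shard_size; otherwise
// emit ignore_value.
int64_t ShardIndexReference(int64_t v, int64_t shard_size,
                            int64_t shard_id, int64_t ignore_value) {
  return (v / shard_size == shard_id) ? (v % shard_size) : ignore_value;
}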
runner.Run(stream); - - NpuOpRunner runner1; - runner1.AddInputs({*in, tmp}); - runner1.AddOutputs({tmp3}); - runner1.SetType("FloorDiv"); - runner1.Run(stream); - - FillNpuTensorWithConstant(&tmp, shard_id); - NpuOpRunner runner2; - runner2.AddInputs({tmp3, tmp}); - runner2.AddOutputs({condition}); - runner2.SetType("Equal"); - runner2.Run(stream); - - phi::DenseTensor tmp4(in->type()); - tmp4.mutable_data(in->dims(), place); - FillNpuTensorWithConstant(&tmp4, ignore_value); - tmp4.Resize(in->dims()); - - NpuOpRunner runner3; - runner3.AddInputs({condition, tmp2, tmp4}); - runner3.AddOutputs({*out}); - runner3.SetType("Select"); - runner3.Run(stream); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(shard_index, - ops::ShardIndexNPUKernel, - ops::ShardIndexNPUKernel); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc deleted file mode 100644 index 0d4ad6331e807..0000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -const int kIgnoreIndex = -100; - -void CheckAttrs(const framework::ExecutionContext& ctx) { - // Add this check is due to Ascend SigmoidCrossEntropyWithLogits - // and SigmoidCrossEntropyWithLogitsGrad does't supoort - // attr normalize and ignore_index - bool normalize = ctx.Attr("normalize"); - int ignore_index = ctx.Attr("ignore_index"); - PADDLE_ENFORCE_EQ(normalize, - false, - platform::errors::InvalidArgument( - "attr normalize must be false, but got true")); - PADDLE_ENFORCE_EQ(ignore_index, - kIgnoreIndex, - platform::errors::InvalidArgument( - "attr ignore_index must be default %d, but got %d", - kIgnoreIndex, - ignore_index)); -} - -template -class SigmoidCrossEntropyWithLogitsNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - CheckAttrs(ctx); - - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("SigmoidCrossEntropyWithLogits", {*x, *label}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidCrossEntropyWithLogitsNPUGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - CheckAttrs(ctx); - - auto* x = ctx.Input("X"); - auto* label = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - 
.stream(); - - const auto& runner_dx = NpuOpRunner( - "SigmoidCrossEntropyWithLogitsGrad", {*x, *label, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsNPUKernel, - ops::SigmoidCrossEntropyWithLogitsNPUKernel); -REGISTER_OP_NPU_KERNEL( - sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsNPUGradKernel, - ops::SigmoidCrossEntropyWithLogitsNPUGradKernel); diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc deleted file mode 100644 index 594b0cc18e886..0000000000000 --- a/paddle/fluid/operators/size_op_npu.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SizeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - Tensor cpu_tensor; - auto cpu_data = - cpu_tensor.mutable_data(out->dims(), platform::CPUPlace()); - cpu_data[0] = x->numel(); - paddle::framework::TensorCopy( - cpu_tensor, - ctx.GetPlace(), - ctx.template device_context(), - out); - ctx.template device_context().Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - size, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel, - ops::SizeNPUKernel); diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc deleted file mode 100644 index a54ba630b274c..0000000000000 --- a/paddle/fluid/operators/slice_op_npu.cc +++ /dev/null @@ -1,254 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -void UpdateAttr(const framework::DDim& in_dims, - const std::vector axes, - const std::vector starts, - const std::vector ends, - std::vector* offsets, - std::vector* size) { - int cnt = 0; - for (int i = 0; i < in_dims.size(); ++i) { - int start = 0; - int end = in_dims[i]; - // NOTE(zhiqiu): Be careful that cnt may exceed axes.size() and result in - // overflow. - int axis = cnt < static_cast(axes.size()) ? axes[cnt] : -1; - if (axis == i) { - start = starts[cnt]; - if (start < 0) { - start = (start + in_dims[i]); - } - start = std::max(start, static_cast(0)); - end = ends[cnt]; - if (end < 0) { - end = (end + in_dims[i]); - } - end = std::min(end, static_cast(in_dims[i])); - cnt++; - } - - (*offsets)[i] = start; - (*size)[i] = end - start; - } -} - -template -class SliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto infer_flags = ctx.Attr>("infer_flags"); - - const auto& in_dims = input->dims(); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - PADDLE_ENFORCE_EQ( - starts.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - - if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || - starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { - // Infer output dims - auto out_dims = out->dims(); - auto slice_dims = out_dims; - for (size_t i = 0; i < axes.size(); ++i) { - // when start == -1 && end == start+1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = in_dims[axes[i]]; - } - } - } - - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); - slice_dims = phi::funcs::GetSliceDims( - in_dims, axes, starts, ends, nullptr, nullptr); - out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); - - out->Resize(out_dims); - } - - out->mutable_data(ctx.GetPlace()); - - std::vector offsets(in_dims.size()); - std::vector size(in_dims.size()); - - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size);
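(UpdateAttr, defined above and invoked here, normalizes each sliced axis's [start, end) pair into a non-negative offset and size; axes not listed keep their full extent. A standalone restatement under assumed container types, since the original template arguments were elided in this diff.)

#include <algorithm>
#include <cstdint>
#include <vector>

// Negative starts/ends wrap around the dim, both are clamped to [0, dim],
// and unsliced axes get offset 0 with the full dim as size.
void SliceOffsetsAndSizes(const std::vector<int64_t>& in_dims,
                          const std::vector<int>& axes,
                          const std::vector<int64_t>& starts,
                          const std::vector<int64_t>& ends,
                          std::vector<int64_t>* offsets,
                          std::vector<int64_t>* sizes) {
  offsets->assign(in_dims.size(), 0);
  sizes->assign(in_dims.size(), 0);
  size_t cnt = 0;
  for (size_t i = 0; i < in_dims.size(); ++i) {
    int64_t start = 0;
    int64_t end = in_dims[i];
    if (cnt < axes.size() && axes[cnt] == static_cast<int>(i)) {
      start = starts[cnt] < 0 ? starts[cnt] + in_dims[i] : starts[cnt];
      start = std::max<int64_t>(start, 0);
      end = ends[cnt] < 0 ? ends[cnt] + in_dims[i] : ends[cnt];
      end = std::min<int64_t>(end, in_dims[i]);
      ++cnt;
    }
    (*offsets)[i] = start;
    (*sizes)[i] = end - start;
  }
}

(For example, in_dims = {4, 5}, axes = {1}, starts = {-3}, ends = {100} yields offsets = {0, 2} and sizes = {4, 3}.)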
- - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); -#if CANN_VERSION_CODE < 512000 - const auto& runner = - NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, { - "size", - size - }}); -#else - NpuOpRunner runner; - runner.SetType("Slice") - .AddInput(*input) - .AddInput(std::move(offsets)) - .AddInput(std::move(size)) - .AddOutput(*out); -#endif - runner.Run(stream); - } -}; - -template -class SliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dinput = - ctx.Output(framework::GradVarName("Input")); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - const auto& in_dims = input->dims(); - int rank = in_dims.size(); - - std::vector offsets(rank); - std::vector size(rank); - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - - std::vector> paddings(rank, std::vector(2)); - for (int i = 0; i < rank; ++i) { - paddings[i][0] = static_cast(offsets[i]); - paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); - } - - phi::DenseTensor tmp_dout; - tmp_dout.ShareDataWith(*dout); - auto out_dims = dout->dims(); - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto decrease_size = decrease_axis.size(); - if (decrease_size > 0) { - if (decrease_size == static_cast(in_dims.size())) { - out_dims = phi::make_ddim(std::vector(decrease_size, 1)); - } else { - std::vector origin_out_shape(out_dims.size() + decrease_size, -1); - for (size_t i = 0; i < decrease_size; ++i) { - origin_out_shape[decrease_axis[i]] = 1; - } - int index = 0; - for (size_t i = 0; i < origin_out_shape.size(); ++i) { - if (origin_out_shape[i] == -1) { - origin_out_shape[i] = out_dims[index]; - ++index; - } - } - out_dims = phi::make_ddim(origin_out_shape); - } - tmp_dout.Resize(out_dims); - } - - dinput->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(slice, - ops::SliceNPUKernel, - ops::SliceNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SliceNPUKernel, -#endif - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL(slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc deleted file mode 100644 index abb6353ca0d1d..0000000000000 --- 
a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/smooth_l1_loss_op.h" - -namespace paddle { -namespace operators { - -template -class SmoothL1LossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* out_diff = context.Output("Diff"); - auto* out_loss = context.Output("Out"); - out_diff->mutable_data(context.GetPlace()); - out_loss->mutable_data(context.GetPlace()); - - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - // out_diff = in_x - in_y - auto stream = - context.template device_context() - .stream(); - const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); - runner1.Run(stream); - - phi::DenseTensor no_reduce_loss(in_x->dtype()); - no_reduce_loss.Resize(in_x->dims()); - no_reduce_loss.mutable_data(context.GetPlace()); - // multiply inside weight before get the loss - if (has_weight) { - phi::DenseTensor tmp_diff(out_diff->dtype()); - tmp_diff.Resize(out_diff->dims()); - tmp_diff.mutable_data(context.GetPlace()); - const auto& runner2 = - NpuOpRunner("Mul", {*out_diff, *inside_weight}, {tmp_diff}, {}); - runner2.Run(stream); - framework::TensorCopy( - tmp_diff, - context.GetPlace(), - context.template device_context(), - out_diff); - - phi::DenseTensor tmp_x(in_x->dtype()); - tmp_x.Resize(in_x->dims()); - tmp_x.mutable_data(context.GetPlace()); - - phi::DenseTensor tmp_y(in_y->dtype()); - tmp_y.Resize(in_y->dims()); - tmp_y.mutable_data(context.GetPlace()); - - // mul input and inside_weight - const auto& runner_x = - NpuOpRunner("Mul", {*in_x, *inside_weight}, {tmp_x}, {}); - runner_x.Run(stream); - const auto& runner_y = - NpuOpRunner("Mul", {*in_y, *inside_weight}, {tmp_y}, {}); - runner_y.Run(stream); - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {tmp_x, tmp_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } else { - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {*in_x, *in_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } - - // multiply outside weight and loss - // reduceSum because the output'shape must be [B,1] - if (has_weight) { - phi::DenseTensor tmp_loss(no_reduce_loss.dtype()); - tmp_loss.Resize(no_reduce_loss.dims()); - tmp_loss.mutable_data(context.GetPlace()); - const auto& runner4 = - NpuOpRunner("Mul", {no_reduce_loss, *outside_weight}, {tmp_loss}, {}); - runner4.Run(stream); - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {tmp_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - 
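(The ReduceSumD constructed here and run just below collapses the element-wise loss of shape [B, D] into per-sample sums of shape [B, 1], via axes = {1} and keep_dims = true. A host-side sketch of that reduction, assuming row-major layout; the function name is hypothetical.)

#include <cstddef>
#include <vector>

// Sum each length-D row of a row-major [B, D] buffer; the result is
// logically [B, 1], matching ReduceSumD(axes={1}, keep_dims=true).
std::vector<float> ReduceRowsToColumn(const std::vector<float>& loss,
                                      std::size_t B, std::size_t D) {
  std::vector<float> out(B, 0.0f);
  for (std::size_t b = 0; b < B; ++b) {
    for (std::size_t d = 0; d < D; ++d) {
      out[b] += loss[b * D + d];
    }
  }
  return out;
}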
runner5.Run(stream); - } else { - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {no_reduce_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - runner5.Run(stream); - } - } -}; - -template -class SmoothL1LossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* diff = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); - auto* outx_grad = - context.Output(framework::GradVarName("X")); - auto* outy_grad = - context.Output(framework::GradVarName("Y")); - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - - auto stream = - context.template device_context() - .stream(); - - // diff == in_x - in_y == diff - 0 - phi::DenseTensor tmp_zero(diff->dtype()); - tmp_zero.Resize(diff->dims()); - tmp_zero.mutable_data(context.GetPlace()); - const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); - runner_zero.Run(stream); - - phi::DenseTensor grad(diff->dtype()); - grad.Resize(diff->dims()); - grad.mutable_data(context.GetPlace()); - // broadcast og(output_grad) to adapt to the npu interface - const auto& runner_broad = - NpuOpRunner("BroadcastToD", - {*og}, - {grad}, - {{"shape", phi::vectorize(diff->dims())}}); - runner_broad.Run(stream); - - phi::DenseTensor gradient(diff->dtype()); - gradient.Resize(diff->dims()); - gradient.mutable_data(context.GetPlace()); - // diff == diff - 0 == in_x - in_y - const auto& runner_grad = NpuOpRunner("SmoothL1LossGrad", - {*diff, tmp_zero, grad}, - {gradient}, - {{"sigma", sigma2}}); - runner_grad.Run(stream); - - // mul weight and gradient - if (has_weight) { - phi::DenseTensor weight(inside_weight->dtype()); - weight.Resize(inside_weight->dims()); - weight.mutable_data(context.GetPlace()); - const auto& runner_weight = - NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); - runner_weight.Run(stream); - - phi::DenseTensor tmp_grad(gradient.dtype()); - tmp_grad.Resize(gradient.dims()); - tmp_grad.mutable_data(context.GetPlace()); - const auto& runner_weight_grad = - NpuOpRunner("Mul", {gradient, weight}, {tmp_grad}, {}); - runner_weight_grad.Run(stream); - - framework::TensorCopy( - tmp_grad, - context.GetPlace(), - context.template device_context(), - &gradient); - } - // outx_grad = gradient - if (outx_grad) { - outx_grad->mutable_data(context.GetPlace()); - framework::TensorCopy( - gradient, - context.GetPlace(), - context.template device_context(), - outx_grad); - } - - // outy_grad = - gradient - if (outy_grad) { - outy_grad->mutable_data(context.GetPlace()); - phi::DenseTensor coeff(phi::DataType::FLOAT32); - coeff.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&coeff, -1); - const auto& runner_y_grad = - NpuOpRunner("Mul", {coeff, gradient}, {*outy_grad}, {}); - runner_y_grad.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc deleted file mode 100644 index de7df0de5b3d5..0000000000000 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ 
/dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto axis = ctx.Attr("axis"); - std::vector axes; - axes.push_back(axis); - framework::NPUAttributeMap attr_input = {{"axes", axes}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class SoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - - auto* dX = ctx.Output(framework::GradVarName("X")); - - auto dims = dX->dims(); - const int rank = dims.size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - int64_t first_dim = 1; - int64_t sec_dim = 1; - for (int i = 0; i < axis; i++) { - first_dim *= dims[i]; - } - for (int i = axis; i < rank; i++) { - sec_dim *= dims[i]; - } - - Tensor tmp_out; - tmp_out.ShareDataWith(*out).Resize({first_dim, sec_dim}); - - Tensor tmp_dOut; - tmp_dOut.ShareDataWith(*dOut).Resize({first_dim, sec_dim}); - - dX->Resize(phi::make_ddim({first_dim, sec_dim})); - dX->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {}; - const auto& runner = NpuOpRunner( - std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, {*dX}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - dX->Resize(dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - softmax, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL( - softmax_grad, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc deleted file mode 100644 index dd1462b1c07cc..0000000000000 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - for (int i = 3; i < 9; ++i) { - init.push_back(static_cast(i)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({2, 3}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({2, 3}); - tensor_out->mutable_data(place); // allocate - - // run - int axis = 1; - f::AttributeMap attrs = { - {"axis", axis}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - }; - - auto op = f::OpRegistry::CreateOp( - "softmax", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector out_init; - - out_init.push_back(static_cast(0.6670)); - out_init.push_back(static_cast(0.5888)); - out_init.push_back(static_cast(0.4543)); - out_init.push_back(static_cast(0.3330)); - out_init.push_back(static_cast(0.4112)); - out_init.push_back(static_cast(0.5457)); - - paddle::framework::TensorFromVector(out_init, ctx, tensor_out); - tensor_out->Resize({2, 3}); - - ctx.Wait(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector dout_init; - for (int i = 0; i < 6; ++i) { - dout_init.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(dout_init, ctx, tensor_dout); - tensor_dout->Resize({2, 3}); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs = { - {"name", std::string("softmax_grad")}, - {"axis", static_cast(0)}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - {"data_format", std::string("AnyLayout")}, - }; - auto op = f::OpRegistry::CreateOp("softmax_grad", - {{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_dx->dims()[0], 
(uint32_t)(2)); - EXPECT_EQ((uint32_t)tensor_dx->dims()[1], (uint32_t)(3)); - - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); - EXPECT_NEAR((float)out_vec[0], (float)(-0.4737), 0.1); - EXPECT_NEAR((float)out_vec[1], (float)(-0.4181), 0.1); - EXPECT_NEAR((float)out_vec[2], (float)(-0.3226), 0.1); - EXPECT_NEAR((float)out_vec[3], (float)(-0.0965), 0.1); - EXPECT_NEAR((float)out_vec[4], (float)(-0.1192), 0.1); - EXPECT_NEAR((float)out_vec[5], (float)(-0.1582), 0.1); -} - -TEST(softmax, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(softmax_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc deleted file mode 100644 index af0e9d55445d5..0000000000000 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" -#include "paddle/phi/kernels/funcs/cross_entropy.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* logits = ctx.Input("Logits"); - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Output("Softmax"); - auto* loss = ctx.Output("Loss"); - auto* backprop = ctx.Output("Backprop"); - auto soft_label = ctx.Attr("soft_label"); - PADDLE_ENFORCE_EQ(soft_label, - false, - platform::errors::Unimplemented( - "soft_label=True is not supported in " - "the npu kernel of softmax_with_cross_entropy.")); - - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); - - PADDLE_ENFORCE_EQ( - labels->numel(), - n, - platform::errors::Unimplemented( - "The size of labels should be equal to phi::funcs::SizeToAxis of " - "logits," - "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", - labels->numel(), - n)); - - loss->mutable_data(ctx.GetPlace()); - backprop->mutable_data(ctx.GetPlace()); - softmax->mutable_data(ctx.GetPlace()); - - phi::DenseTensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; - logits_2d.ShareDataWith(*logits).Resize({n, d}); - labels_1d.ShareDataWith(*labels).Resize({n}); - loss_1d.ShareDataWith(*loss).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - for (auto i = axis; i < logits->dims().size(); ++i) { - axes.push_back(i); - } - const auto& runner_softmax = - NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); - runner_softmax.Run(stream); - - // SparseSoftmaxCrossEntropyWithLogits - const auto& runner_s = NpuOpRunner("SparseSoftmaxCrossEntropyWithLogits", - {logits_2d, labels_1d}, - {loss_1d, backprop_2d}, - {}); - runner_s.Run(stream); - } -}; - -template -class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* backprop = ctx.Input("Backprop"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* logits_grad = - ctx.Output(framework::GradVarName("Logits")); - - PADDLE_ENFORCE_NOT_NULL(backprop, - platform::errors::PreconditionNotMet( - "backprop should not be null in NPU kernel of " - "softmax_with_cross_entropy_grad.")); - logits_grad->mutable_data(ctx.GetPlace()); - - const int rank = logits_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); - - phi::DenseTensor logits_grad_2d, loss_grad_1d, backprop_2d; - - logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); - loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_mul = - NpuOpRunner("Mul", {*loss_grad, *backprop}, {*logits_grad}, {}); - runner_mul.Run(stream); - } -}; -} // namespace 
operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyNPUKernel, - ops::SoftmaxWithCrossEntropyNPUKernel); -REGISTER_OP_NPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc deleted file mode 100644 index 763b375d00e9b..0000000000000 --- a/paddle/fluid/operators/split_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/split_op.h" - -namespace paddle { -namespace operators { - -template -class SplitNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - int num = ctx.Attr("num"); - std::vector sections = ctx.Attr>("sections"); - int axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The AxisTensor is not supported on NPU now.")); - } - if (ctx.HasInput("SectionsTensorList")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The SectionsTensorList is not supported on NPU now.")); - } - - std::vector outputs; - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(*outs[j]); - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - if (sections.size() == 0) { - framework::NPUAttributeMap attr_input = {{"num_split", num}, - {"split_dim", axis}}; - runner.SetType("SplitD").AddInputs({*in}).AddOutputs(outputs).AddAttrs( - attr_input); - } else { - framework::NPUAttributeMap attr_input = { - {"size_splits", sections}, - {"split_dim", axis}, - {"num_split", static_cast(sections.size())}}; - runner.SetType("SplitVD").AddInput(*in).AddOutputs(outputs).AddAttrs( - attr_input); - } - - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(split, - ops::SplitNPUKernel, - ops::SplitNPUKernel, - ops::SplitNPUKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc deleted file mode 100644 index fb7d4607fc085..0000000000000 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *out = context.Output("Out"); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - std::vector axis; - for (int i = 0; i < x->dims().size(); ++i) { - axis.push_back(i); - } - out->mutable_data(place); - const auto &runner = NpuOpRunner( - "SquareSumV1", {*x}, {*out}, {{"axis", axis}, {"keep_dims", false}}); - runner.Run(stream); - } -}; - -template -class SquaredL2NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *x_grad = - context.Output(framework::GradVarName("X")); - auto *out_grad = - context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - out_grad->numel(), - 1, - platform::errors::InvalidArgument( - "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - // broadcast out_grad - phi::DenseTensor broadcasted_out_grad; - broadcasted_out_grad.mutable_data(x_grad->dims(), place); - const auto &broadcast_runner = - NpuOpRunner("BroadcastToD", - {*out_grad}, - {broadcasted_out_grad}, - {{"shape", phi::vectorize(x_grad->dims())}}); - broadcast_runner.Run(stream); - // mul x - phi::DenseTensor tmp_x_grad; - tmp_x_grad.mutable_data(x_grad->dims(), place); - const auto &mul_x_runner = - NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); - mul_x_runner.Run(stream); - // mul coefficient:2 - phi::DenseTensor coefficient; - coefficient.mutable_data({1}, place); - FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); - x_grad->mutable_data(place); - const auto &mul_coefficient_runner = - NpuOpRunner("Mul", {tmp_x_grad, coefficient}, {*x_grad}, {}); - mul_coefficient_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squared_l2_norm, - ops::SquaredL2NormNPUKernel); -REGISTER_OP_NPU_KERNEL( - squared_l2_norm_grad, - ops::SquaredL2NormGradNPUKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu.cc b/paddle/fluid/operators/squeeze_op_npu.cc deleted file mode 100644 index 308f092ad740f..0000000000000 --- a/paddle/fluid/operators/squeeze_op_npu.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squeeze_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel); -REGISTER_OP_NPU_KERNEL( - squeeze2, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel); -REGISTER_OP_NPU_KERNEL( - squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel); -REGISTER_OP_NPU_KERNEL( - squeeze2_grad, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel, - ops::Squeeze2GradKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc deleted file mode 100644 index f0f683e488246..0000000000000 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(squeeze); -USE_OP_DEVICE_KERNEL(squeeze, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - int dim0 = 1; - int dim1 = 10; - int dim2 = 1; - - std::vector init; - for (int64_t i = 0; i < dim0 * dim1 * dim2; ++i) { - init.push_back(static_cast(0.1)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({dim0, dim1, dim2}); - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector axis; - axis.push_back(2); - f::AttributeMap attrs = {{"axes", axis}}; - - auto op = f::OpRegistry::CreateOp( - "squeeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(2)); - EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(dim0)); - EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(0.1)); - } - - ctx.Wait(); -} - -TEST(squeeze, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc deleted file mode 100644 index 8c6447971d9ad..0000000000000 --- a/paddle/fluid/operators/stack_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class StackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -template -class StackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto& runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - stack, - paddle::operators::StackNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackNPUKernel, -#endif - paddle::operators::StackNPUKernel, - paddle::operators::StackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - stack_grad, - paddle::operators::StackGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackGradNPUKernel, -#endif - paddle::operators::StackGradNPUKernel, - paddle::operators::StackGradNPUKernel); diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc deleted file mode 100644 index 4c3bfed5d5d4b..0000000000000 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/strided_slice.h" - -namespace paddle { -namespace operators { - -using Variable = framework::Variable; -using LoDTensorArray = framework::LoDTensorArray; -using DDim = framework::DDim; - -template -class StridedSliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); - switch (rank) { - case 1: - StridedSliceCompute<1>(ctx); - break; - case 2: - StridedSliceCompute<2>(ctx); - break; - case 3: - StridedSliceCompute<3>(ctx); - break; - case 4: - StridedSliceCompute<4>(ctx); - break; - case 5: - StridedSliceCompute<5>(ctx); - break; - case 6: - StridedSliceCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); - auto in_dims = in->dims(); - - // list - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - // vector> - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - // phi::DenseTensor - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - // out dims calculation - std::vector out_dims_vector(in_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - in_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - // check whether need to reverse (false: stride > 0; true: stride < 0) - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - in_dims, - infer_flags, - 
decrease_axis, - starts.size()); - - // construct the starts_indices, ends_indices and strides_indices tensor for - // calling StridedSlice op - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, ctx.device_context(), &starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, ctx.device_context(), &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, ctx.device_context(), &strides_indices_tensor); - - auto out_dims_origin = out_dims; - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], - 1, - platform::errors::InvalidArgument( - "the size of decrease dimension should be 1, but received %d.", - out_dims[decrease_axis[i]])); - out_dims_origin[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims_origin.size(); ++i) { - if (out_dims_origin[i] != 0) { - new_out_shape.push_back(out_dims_origin[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - out_dims_origin = phi::make_ddim(new_out_shape); - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - out->Resize(out_dims); - out->mutable_data(place); - - const auto& runner = NpuOpRunner("StridedSlice", - {*in, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor}, - {*out}, - {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}); - runner.Run(stream); - - if (need_reverse) { - phi::DenseTensor out_tmp; - out_tmp.mutable_data(out_dims, place); - paddle::framework::TensorCopy( - *out, - place, - ctx.template device_context(), - &out_tmp); - - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, ctx.device_context(), &reverse_axis); - - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {out_tmp, reverse_axis}, {*out}); - runner_reverse.Run(stream); - } - - if (decrease_axis.size() > 0) { - out->Resize(out_dims_origin); - } - } -}; - -template -class StridedSliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = 
ctx.Input("Input")->dims().size(); - - switch (rank) { - case 1: - StridedSliceGradCompute<1>(ctx); - break; - case 2: - StridedSliceGradCompute<2>(ctx); - break; - case 3: - StridedSliceGradCompute<3>(ctx); - break; - case 4: - StridedSliceGradCompute<4>(ctx); - break; - case 5: - StridedSliceGradCompute<5>(ctx); - break; - case 6: - StridedSliceGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto& dev_ctx = - ctx.template device_context(); - - auto* input = ctx.Input("Input"); - auto input_dims = input->dims(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); - dx->mutable_data(input_dims, place); - - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - std::vector out_dims_vector(input_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - input_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - input_dims, - infer_flags, - decrease_axis, - starts.size()); - - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, dev_ctx, 
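// Note: StridedSliceGrad scatters dout back into a zero-initialized tensor
// of the original input shape, so the grad pass rebuilds the same
// starts/ends/strides index tensors and additionally passes that shape
// (input_dims_tensor below) as the first input of the Ascend op.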
&starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, dev_ctx, &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, dev_ctx, &strides_indices_tensor); - - std::vector input_dims_vector; - for (int i = 0; i < input_dims.size(); i++) { - input_dims_vector.push_back(input_dims[i]); - } - phi::DenseTensor input_dims_tensor; - paddle::framework::TensorFromVector( - input_dims_vector, dev_ctx, &input_dims_tensor); - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto stream = dev_ctx.stream(); - framework::NPUAttributeMap attr_input = {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}; - - if (need_reverse) { - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, dev_ctx, &reverse_axis); - - phi::DenseTensor dout_tmp; - dout_tmp.mutable_data(dout->dims(), place); - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {*dout, reverse_axis}, {dout_tmp}); - runner_reverse.Run(stream); - - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - dout_tmp}, - {*dx}, - attr_input); - runner.Run(stream); - } else { - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - *dout}, - {*dx}, - attr_input); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - strided_slice, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - strided_slice_grad, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc deleted file mode 100644 index 5d1656b79e9a8..0000000000000 --- a/paddle/fluid/operators/sum_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
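The kernel below maps Paddle's sum over dense tensors to Ascend's AddN. As a reference for that branch, a minimal host-side model of the computation (an illustrative helper, not a Paddle API):

#include <vector>

// out[j] = x0[j] + x1[j] + ... ; all inputs share one length.
std::vector<float> AddNReference(const std::vector<std::vector<float>>& xs) {
  std::vector<float> out(xs.front().size(), 0.0f);
  for (const auto& x : xs)
    for (size_t j = 0; j < x.size(); ++j) out[j] += x[j];
  return out;
}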
*/

-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using SelectedRows = phi::SelectedRows;
-
-template <typename DeviceContext, typename T>
-class SumNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto out_var = ctx.OutputVar("Out");
-    if (out_var->IsType<phi::DenseTensor>()) {
-      auto *out = out_var->GetMutable<phi::DenseTensor>();
-      auto x = ctx.MultiInput<phi::DenseTensor>("X");
-      out->mutable_data<T>(ctx.GetPlace());
-
-      auto place = ctx.GetPlace();
-
-      int n = static_cast<int>(x.size());
-      if (n == 1) {
-        paddle::framework::TensorCopy(*x[0], place, out);
-        return;
-      }
-
-      std::vector<phi::DenseTensor> inputs;
-      std::vector<std::string> names;
-      for (int i = 0; i < n; ++i) {
-        if (x[i] && x[i]->numel() > 0) {
-          inputs.push_back(*x[i]);
-          names.push_back("x" + std::to_string(i));
-        } else {
-          continue;
-        }
-      }
-
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
-      runner.AddInputNames(names);
-      runner.Run(stream);
-    } else if (out_var->IsType<framework::LoDTensorArray>()) {
-      auto in_vars = ctx.MultiInputVar("X");
-      bool in_place = out_var == in_vars[0];
-      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
-      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
-        PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(),
-                          true,
-                          platform::errors::InvalidArgument(
-                              "Only support all inputs are TensorArray, "
-                              "but inputs[%d] is not TensorArray.",
-                              i));
-        auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
-
-        for (size_t i = 0; i < in_array.size(); ++i) {
-          if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) {
-            if (i >= out_array.size()) {
-              out_array.resize(i + 1);
-            }
-            if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) {
-              framework::TensorCopy(in_array[i],
-                                    in_array[i].place(),
-                                    ctx.device_context(),
-                                    &out_array[i]);
-              out_array[i].set_lod(in_array[i].lod());
-            } else {
-              PADDLE_ENFORCE_EQ(
-                  out_array[i].lod(),
-                  in_array[i].lod(),
-                  platform::errors::InvalidArgument(
-                      "The lod message between inputs[%d] and"
-                      " outputs[%d] must be same, but now is not same.",
-                      i,
-                      i));
-              auto stream = ctx.template device_context<
-                                   paddle::platform::NPUDeviceContext>()
-                                .stream();
-              NpuOpRunner runner{
-                  "Add", {out_array[i], in_array[i]}, {out_array[i]}, {}};
-              runner.Run(stream);
-            }
-          }
-        }
-      }
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Expected type of Output(out) must be phi::DenseTensor or "
-          "LoDTensorArray. But got "
-          "unsupport type: %s.",
-          framework::ToTypeName(out_var->Type())));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    sum,
-    ops::SumNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::SumNPUKernel<paddle::platform::NPUDeviceContext,
-                      paddle::platform::float16>);
diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc
deleted file mode 100644
index 1b3ed3ccc7a73..0000000000000
--- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc
+++ /dev/null
@@ -1,1105 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -void training_or_inference(const framework::ExecutionContext &ctx, - const aclrtStream &stream, - const platform::Place &place, - const DataLayout &layout, - const bool &test_mode, - const int &N, - const int &C, - const int &H, - const int &W, - const float epsilon, - const float &momentum, - const phi::DenseTensor *common_mean, - const phi::DenseTensor *common_var, - const phi::DenseTensor *x, - const phi::DenseTensor *scale, - const phi::DenseTensor *bias, - const phi::DenseTensor *mean, - const phi::DenseTensor *variance, - phi::DenseTensor *mean_out, - phi::DenseTensor *variance_out, - phi::DenseTensor *saved_mean, - phi::DenseTensor *saved_variance, - phi::DenseTensor *y) { - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - phi::DenseTensor common_mean_tile_1; - { - common_mean_tile_1.Resize({C}); - common_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_mean, place, &common_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_mean_tile.Resize(x->dims()); - common_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_mean_tile_1}, {common_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_1; - { - common_var_tile_1.Resize({C}); - common_var_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_var, place, &common_var_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_var_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_var_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_var_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_var_tile.Resize(x->dims()); - common_var_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_var_tile_1}, {common_var_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - common_var_tile_add_epsilon.Resize(x->dims()); - common_var_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {common_var_tile}, {common_var_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon_sqrt; - { - common_var_tile_add_epsilon_sqrt.Resize(x->dims()); - common_var_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {common_var_tile_add_epsilon}, - {common_var_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor x_sub_common_mean; - { - x_sub_common_mean.Resize(x->dims()); - x_sub_common_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, common_mean_tile}, {x_sub_common_mean}, {}); - 
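// Note: together with the Div, Mul, and Add blocks that follow, this
// implements the batch norm transform
//   y = scale * (x - mean) / sqrt(var + epsilon) + bias
// one elementwise Ascend op at a time, with the per-channel operands tiled
// to the full input shape via TileD above.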
runner.Run(stream); - } - - phi::DenseTensor normalized; - { - normalized.Resize(x->dims()); - normalized.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", - {x_sub_common_mean, common_var_tile_add_epsilon_sqrt}, - {normalized}, - {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor normalized_mul_scale; - { - normalized_mul_scale.Resize(x->dims()); - normalized_mul_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {normalized, scale_tile}, {normalized_mul_scale}, {}); - runner.Run(stream); - } - - phi::DenseTensor bias_tile_1; - { - bias_tile_1.Resize({C}); - bias_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*bias, place, &bias_tile_1); - if (layout == phi::DataLayout::kNCHW) - bias_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - bias_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor bias_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - bias_tile.Resize(x->dims()); - bias_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {bias_tile_1}, {bias_tile}, attr_input); - runner.Run(stream); - } - - // calculate y - { - y->mutable_data(place); - const auto &runner = - NpuOpRunner("Add", {normalized_mul_scale, bias_tile}, {*y}, {}); - runner.Run(stream); - } - - if (!test_mode) { - phi::DenseTensor ones; - { - ones.Resize({C}); - ones.mutable_data(place); - FillNpuTensorWithConstant(&ones, 1); - } - - // cacl mean_out - { - phi::DenseTensor common_mean_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - common_mean_mul_1_sub_momentum.Resize({C}); - common_mean_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner("Muls", - {*common_mean}, - {common_mean_mul_1_sub_momentum}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_mul_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - mean_mul_momentum.Resize({C}); - mean_mul_momentum.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*mean}, {mean_mul_momentum}, attr_input); - runner.Run(stream); - } - - mean_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {common_mean_mul_1_sub_momentum, mean_mul_momentum}, - {*mean_out}, - {}); - runner.Run(stream); - } - - // cacl variance_out - { - phi::DenseTensor momentum_mul_var; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - momentum_mul_var.Resize({C}); - momentum_mul_var.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*variance}, {momentum_mul_var}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - var_ref_mul_1_sub_momentum.Resize({C}); - var_ref_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {*common_var}, 
{var_ref_mul_1_sub_momentum}, attr_input); - runner.Run(stream); - } - - variance_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {var_ref_mul_1_sub_momentum, momentum_mul_var}, - {*variance_out}, - {}); - runner.Run(stream); - } - - // cacl saved_variance - { - phi::DenseTensor var_ref_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_add_epsilon.Resize({C}); - var_ref_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {*common_var}, {var_ref_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_add_epsilon_sqrt; - { - var_ref_add_epsilon_sqrt.Resize({C}); - var_ref_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner( - "Sqrt", {var_ref_add_epsilon}, {var_ref_add_epsilon_sqrt}, {}); - runner.Run(stream); - } - - saved_variance->mutable_data(place); - - const auto &runner = NpuOpRunner( - "Div", {ones, var_ref_add_epsilon_sqrt}, {*saved_variance}, {}); - runner.Run(stream); - } - } -} - -template -class SyncBatchNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - PADDLE_ENFORCE_EQ(use_global_stats, - false, - platform::errors::InvalidArgument( - "sync_batch_norm doesn't support " - "to set use_global_stats True. Please use batch_norm " - "in this case.")); - - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), - 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, - x_dims.size())); - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - bool test_mode = is_test && (!trainable_stats); - if (test_mode) { // inference - // cacl saved_mean - saved_mean->mutable_data(place); - paddle::framework::TensorCopySync(*mean, place, saved_mean); - - // cacl saved_variance - saved_variance->mutable_data(place); - paddle::framework::TensorCopySync(*variance, place, saved_variance); - - // cacl y - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - mean, - variance, - x, - scale, - bias, - mean, - variance, - NULL, - NULL, - NULL, - NULL, - y); - - } else { // training - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // cacl saved_mean and var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - phi::DenseTensor x_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_sum.Resize({C}); - x_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*x}, {x_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - - float device_counts = 0.0; - if (comm) { - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(mean_out->dtype())); - - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - - // HcclAllReduce x_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_sum.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - // HcclAllReduce x_square_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_square_sum.data())); - void *recvbuff = sendbuff; - 
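// Note: once x_sum and x_square_sum have been all-reduced across ranks, the
// factor 1.0f * C / x_numel / device_counts applied below equals
// 1 / (N * H * W * device_counts), yielding the global per-channel moments
// E[x] and E[x^2]; the variance then follows as E[x^2] - E[x]^2.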
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - } - - // cacl saved_mean - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - saved_mean->mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {x_sum}, {*saved_mean}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - phi::DenseTensor saved_mean_square; - { - saved_mean_square.Resize({C}); - saved_mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {saved_mean_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tmp; - var_ref_tmp.Resize({C}); - var_ref_tmp.mutable_data(place); - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {x_square_sum}, {var_ref_tmp}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {var_ref_tmp, saved_mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - saved_mean, - &var_ref, - x, - scale, - bias, - mean, - variance, - mean_out, - variance_out, - saved_mean, - saved_variance, - y); - } - } -}; - -template -class SyncBatchNormNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto *saved_mean = ctx.Input("SavedMean"); - - const phi::DenseTensor *x; - if (ctx.HasInput("Y")) { - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "sync_batch_norm_grad doesn't support input Y")); - } else { - x = ctx.Input("X"); - } - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(scale->dtype())); - - float device_counts = 0.0; - if (comm) { - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - 
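// Note: every rank contributes 1 to device_count_tensor, so after the
// all-reduce it holds the number of participating devices; the grad kernel
// requires at least two, as enforced right below.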
- std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - PADDLE_ENFORCE_GE( - device_counts, - 2, - platform::errors::PreconditionNotMet("device_counts should >= 2.")); - } - - // cacl var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - // cacl var_ref - { - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum_mean; - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel}}; - x_square_sum_mean.Resize({C}); - x_square_sum_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {x_square_sum}, {x_square_sum_mean}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_square; - { - mean_square.Resize({C}); - mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {mean_square}, {}); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {x_square_sum_mean, mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - phi::DenseTensor saved_mean_tile_1; - { - saved_mean_tile_1.Resize({C}); - saved_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*saved_mean, place, &saved_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - saved_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - saved_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor saved_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - saved_mean_tile.Resize(x->dims()); - saved_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {saved_mean_tile_1}, {saved_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_sub_saved_mean; - { - x_sub_saved_mean.Resize(x->dims()); - x_sub_saved_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, saved_mean_tile}, {x_sub_saved_mean}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_1; - { - var_ref_tile_1.Resize({C}); - var_ref_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(var_ref, place, &var_ref_tile_1); - if (layout == phi::DataLayout::kNCHW) - var_ref_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - var_ref_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor var_ref_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - var_ref_tile.Resize(x->dims()); - var_ref_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {var_ref_tile_1}, {var_ref_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_tile_add_epsilon.Resize(x->dims()); - var_ref_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {var_ref_tile}, {var_ref_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon_sqrt; - { 
- var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); - var_ref_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {var_ref_tile_add_epsilon}, - {var_ref_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_for_scale; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mul_x_sub_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mul_x_sub_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = NpuOpRunner( - "Muls", {dy_mul_x_sub_mean}, {dy_mul_x_sub_mean}, attr_input); - runner.Run(stream); - } - } - - // cacl d_x - if (d_x) { - phi::DenseTensor dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {dy_mean}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mean_tile_1; - { - dy_mean_tile_1.Resize({C}); - dy_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(dy_mean, place, &dy_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mean_tile; - { - framework::NPUAttributeMap 
attr_input = {{"multiples", multiples}}; - dy_mean_tile.Resize(x->dims()); - dy_mean_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {dy_mean_tile_1}, {dy_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_sub_dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } else { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean_mean; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mul_x_sub_mean_mean.Resize({C}); - dy_mul_x_sub_mean_mean.mutable_data(place); - const auto &runner = NpuOpRunner("ReduceMeanD", - {dy_mul_x_sub_mean}, - {dy_mul_x_sub_mean_mean}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile_1; - { - dy_mul_x_sub_mean_mean_tile_1.Resize({C}); - dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync( - dy_mul_x_sub_mean_mean, place, &dy_mul_x_sub_mean_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); - dy_mul_x_sub_mean_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner("TileD", - {dy_mul_x_sub_mean_mean_tile_1}, - {dy_mul_x_sub_mean_mean_tile}, - attr_input); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) - // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile - phi::DenseTensor tmp1; - { - tmp1.Resize(x->dims()); - tmp1.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {x_sub_saved_mean, dy_mul_x_sub_mean_mean_tile}, {tmp1}, {}); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) - // tmp1 / (var + epsilon) - // tmp1 / var_ref_tile_add_epsilon - phi::DenseTensor tmp2; - { - tmp2.Resize(x->dims()); - tmp2.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", {tmp1, var_ref_tile_add_epsilon}, {tmp2}, {}); - runner.Run(stream); - } - - // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / - // (var + epsilon) - // dy_sub_dy_mean - tmp2 - phi::DenseTensor tmp3; - { - tmp3.Resize(x->dims()); - tmp3.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {dy_sub_dy_mean, tmp2}, {tmp3}, {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - // scale 
* (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), - // axis) / (var + epsilon)) - // scale * tmp3 - phi::DenseTensor dx_1; - { - dx_1.Resize(x->dims()); - dx_1.mutable_data(place); - - const auto &runner = NpuOpRunner("Mul", {scale_tile, tmp3}, {dx_1}, {}); - runner.Run(stream); - } - - // dx_1 / var_ref_tile_add_epsilon_sqrt - { - d_x->Resize(x->dims()); - d_x->mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", {dx_1, var_ref_tile_add_epsilon_sqrt}, {*d_x}, {}); - runner.Run(stream); - } - } - - // cacl d_scale - if (d_scale) { - phi::DenseTensor d_scale_2; - { - d_scale_2.Resize(x->dims()); - d_scale_2.mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", - {dy_mul_x_sub_mean_for_scale, var_ref_tile_add_epsilon_sqrt}, - {d_scale_2}, - {}); - runner.Run(stream); - } - - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_scale->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {d_scale_2}, {*d_scale}, attr_input); - runner.Run(stream); - } - } - - // cacl d_bias - if (d_bias) { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sync_batch_norm, - ops::SyncBatchNormNPUKernel); -REGISTER_OP_NPU_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormNPUGradKernel); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc deleted file mode 100644 index ce10caf1b2e19..0000000000000 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
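The kernel below maps take_along_axis to Ascend's GatherElements. For context, a minimal 2-D reference of the dim = 1 case (an illustrative helper, not a Paddle or CANN API):

#include <vector>

// result[i][j] = input[i][index[i][j]] for dim == 1.
std::vector<std::vector<float>> TakeAlongAxisDim1(
    const std::vector<std::vector<float>>& input,
    const std::vector<std::vector<int>>& index) {
  std::vector<std::vector<float>> result(index.size());
  for (size_t i = 0; i < index.size(); ++i)
    for (int j : index[i]) result[i].push_back(input[i][j]);
  return result;
}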
*/

-// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in
-// cmake/operators.cmake when Paddle supports
-#if (CANN_VERSION_CODE >= 504000)
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class NPUTakeAlongAxisKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto input = ctx.Input<phi::DenseTensor>("Input");
-    auto axis = ctx.Attr<int>("Axis");
-    auto index = ctx.Input<phi::DenseTensor>("Index");
-    auto result = ctx.Output<phi::DenseTensor>("Result");
-    result->mutable_data<T>(ctx.GetPlace());
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner(
-        "GatherElements", {*input, *index}, {*result}, {{"dim", axis}});
-    runner.Run(stream);
-  }
-};
-
-template <typename T>
-class NPUTakeAlongAxisGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto axis = ctx.Attr<int>("Axis");
-    auto index = ctx.Input<phi::DenseTensor>("Index");
-    auto result_grad =
-        ctx.Input<phi::DenseTensor>(framework::GradVarName("Result"));
-
-    auto input_grad =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
-    input_grad->mutable_data<T>(ctx.GetPlace());
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    const auto& runner = NpuOpRunner("ScatterAddWithAxis",
-                                     {*input_grad, *index, *result_grad},
-                                     {*input_grad},
-                                     {{"axis", axis}});
-    runner.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    take_along_axis,
-    ops::NPUTakeAlongAxisKernel<float>,
-    ops::NPUTakeAlongAxisKernel<plat::float16>,
-    ops::NPUTakeAlongAxisKernel<int>,
-    ops::NPUTakeAlongAxisKernel<int64_t>)
-REGISTER_OP_NPU_KERNEL(
-    take_along_axis_grad,
-    ops::NPUTakeAlongAxisGradKernel<float>,
-    ops::NPUTakeAlongAxisGradKernel<plat::float16>,
-    ops::NPUTakeAlongAxisGradKernel<int>,
-    ops::NPUTakeAlongAxisGradKernel<int64_t>)
-
-#endif
diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc
deleted file mode 100644
index 2e3ab9dac0461..0000000000000
--- a/paddle/fluid/operators/tile_op_npu.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
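In the tile kernel below, repeat_times and the input shape are first promoted to a common rank by left-padding the shorter one with 1s. A standalone sketch of that alignment step (illustrative helper only):

#include <cstdint>
#include <vector>

// Left-pad the shorter of dims/repeats with 1s until the ranks match.
void AlignRanks(std::vector<int64_t>* dims, std::vector<int>* repeats) {
  if (repeats->size() < dims->size())
    repeats->insert(repeats->begin(), dims->size() - repeats->size(), 1);
  else
    dims->insert(dims->begin(), repeats->size() - dims->size(), 1);
}
// e.g. dims = {4, 5}, repeats = {2, 1, 3}  ->  dims = {1, 4, 5}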
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/tile_op_functor.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class TileNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, - 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_size)); - rank = std::max(rank, repeat_times_size); - Tile(context); - } - - protected: - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], - 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), - vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), - repeat_times.size())); - auto* out0 = context.Output("Out"); - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - std::vector temp(repeat_times.size(), 1); - if (repeat_times == temp) { - framework::TensorCopy(*in0, - context.GetPlace(), - context.template device_context(), - out0); - return; - } - - // const auto& runner = - // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = context.template device_context().stream(); - NpuOpRunner runner; - runner.SetType("Tile") - .AddInput(*in0) - .AddInput(std::move(repeat_times)) - .AddOutput(*out0) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(tile, - ops::TileNPUKernel, - ops::TileNPUKernel, -#ifdef 
PADDLE_WITH_ASCEND_INT64
-                       ops::TileNPUKernel<int64_t>,
-#endif
-                       ops::TileNPUKernel<bool>,
-                       ops::TileNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc
deleted file mode 100644
index 478523721458d..0000000000000
--- a/paddle/fluid/operators/top_k_op_npu.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/operators/top_k_op.h"
-
-namespace paddle {
-namespace operators {
-
-void gen_assist_seq(phi::DenseTensor* assit_tensor,
-                    int64_t dim,
-                    const framework::ExecutionContext& ctx) {
-  const int64_t dimx2 = dim;
-  std::vector<paddle::platform::float16> assit;
-  assit.resize(2 * dimx2);
-  for (int64_t i = 0; i < dimx2; i++) {
-    // for i in range [0, dim]
-    assit[i] = static_cast<paddle::platform::float16>(i);
-
-    // for i in range [dim, dimx2]
-    int64_t idx = static_cast<int64_t>(
-        static_cast<paddle::platform::float16>(i));
-    int64_t gap = i - idx;
-    assit[i + dim] = static_cast<paddle::platform::float16>(gap);
-  }
-  framework::TensorFromVector(assit, ctx.device_context(), assit_tensor);
-}
-
-template <typename DeviceContext, typename T>
-class TopkNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // read input
-    auto* input = ctx.Input<phi::DenseTensor>("X");
-    auto* output = ctx.Output<phi::DenseTensor>("Out");
-    auto* indices = ctx.Output<phi::DenseTensor>("Indices");
-
-    size_t k = static_cast<int>(ctx.Attr<int>("k"));
-
-    output->mutable_data<T>(ctx.GetPlace());
-    indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    // prepare the assist tensor
-    auto size = input->dims().size();
-    // dim is the last dimension of input
-    auto dim = input->dims()[size - 1];
-    phi::DenseTensor assist_seq_tensor;
-    assist_seq_tensor.Resize({2 * dim});
-    assist_seq_tensor.mutable_data<T>(ctx.GetPlace());
-    gen_assist_seq(&assist_seq_tensor, dim, ctx);
-
-    framework::NPUAttributeMap attr_input = {{"sorted", "true"},
-                                             {"k", static_cast<int>(k)},
-                                             {"dim", -1},
-                                             {"largest", true}};
-
-    phi::DenseTensor tmp_indices(phi::DataType::INT32);
-    tmp_indices.Resize(indices->dims());
-    tmp_indices.mutable_data<int32_t>(ctx.GetPlace());
-
-    // run ascend
-    const auto& runner = NpuOpRunner("TopKD",
-                                     {*input, assist_seq_tensor},
-                                     {*output, tmp_indices},
-                                     attr_input);
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-
-    // cast indices from INT32 to INT64
-    auto dst_dtype =
-        ConvertToNpuDtype(framework::TransToProtoVarType(indices->dtype()));
-    const auto& runner_cast_indices =
-        NpuOpRunner("Cast",
-                    {tmp_indices},
-                    {*indices},
-                    {{"dst_type", static_cast<int>(dst_dtype)}});
-    runner_cast_indices.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-// The Ascend TopKD op only supports float16 input.
-REGISTER_OP_NPU_KERNEL(top_k,
-                       ops::TopkNPUKernel<paddle::platform::NPUDeviceContext,
                                           paddle::platform::float16>);
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
deleted file mode 100644
index 4e0b0650b9af6..0000000000000
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
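A note on the assist tensor built by gen_assist_seq above: its first half is the index ramp [0, dim) cast to float16, and its second half stores each index's rounding error from that cast, letting TopKD recover exact integer indices. A rough standalone model of the idea, with plain float standing in for Ascend's float16 (where the gap becomes nonzero once i exceeds 2048):

#include <cstdint>
#include <vector>

// First half: indices cast to the narrow type; second half: the rounding
// error of that cast, so index = narrow(i) + narrow(gap) recovers i exactly.
std::vector<float> GenAssistSeqModel(int64_t dim) {
  std::vector<float> assist(2 * dim);
  for (int64_t i = 0; i < dim; ++i) {
    assist[i] = static_cast<float>(i);
    int64_t idx = static_cast<int64_t>(assist[i]);
    assist[i + dim] = static_cast<float>(i - idx);
  }
  return assist;
}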
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { -// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel -// may lead to large accuracy error for float32 data -template -class TopkV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* k_tensor = context.Input("K"); - auto* out = context.Output("Out"); - auto* indices = context.Output("Indices"); // type: INT64 - - int32_t k = static_cast(context.Attr("k")); - int axis = static_cast(context.Attr("axis")); - const bool sorted = static_cast(context.Attr("sorted")); - const bool largest = static_cast(context.Attr("largest")); - - if (axis < 0) { - axis += input->dims().size(); - } - - if (k_tensor != nullptr) { - std::vector v_tmp(1); - paddle::framework::TensorToVector( - *k_tensor, - context.template device_context(), - &v_tmp); - k = static_cast(v_tmp[0]); - } - - framework::DDim output_dims = input->dims(); - output_dims[axis] = k; - - out->Resize(output_dims); - indices->Resize(output_dims); - - out->mutable_data(context.GetPlace()); - indices->mutable_data(context.GetPlace()); - - phi::DenseTensor indices_int32(phi::DataType::INT32); - indices_int32.Resize(output_dims); - indices_int32.mutable_data(context.GetPlace()); - - auto npu_stream = - context.template device_context() - .stream(); - - NpuOpRunner npu_op_runner_topkv2; - npu_op_runner_topkv2.SetType("TopKV2") - .AddInput(*input) - .AddInput(std::vector{k}) - .AddOutput(*out) - .AddOutput(indices_int32) - .AddAttr("sorted", sorted) - .AddAttr("dim", axis) - .AddAttr("largest", largest) - .Run(npu_stream); - - // Cast 'indices_int32' to 'indices', from INT32 to INT64 - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(indices->type())); - const auto& npu_op_runner_cast = - NpuOpRunner("Cast", - {indices_int32}, - {*indices}, - {{"dst_type", static_cast(dst_dtype)}}); - npu_op_runner_cast.Run(npu_stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(top_k_v2, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc deleted file mode 100644 index 5af2edd60ce8f..0000000000000 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class TransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - std::vector axis = ctx.Attr>("axis"); - out->mutable_data(ctx.device_context().GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*x) - .AddInput(std::move(axis)) - .AddOutput(*out); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class TransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - std::vector axis = ctx.Attr>("axis"); - std::vector reversed_axis(axis); - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*out_grad) - .AddInput(std::move(reversed_axis)) - .AddOutput(*x_grad); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - transpose2, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeNPUKernel, -#endif - ops::TransposeNPUKernel, - ops::TransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(transpose2_grad, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeGradNPUKernel, -#endif - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc deleted file mode 100644 index 0ef5af349decf..0000000000000 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
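transpose2_grad in the deleted transpose_op_npu.cc reuses the forward Transpose op, feeding it the inverse of the forward axis permutation; the unit test that follows exercises exactly this round trip on a 2x3 tensor. The inversion is the small loop below, sketched standalone:

    #include <cstddef>
    #include <vector>

    // reversed[axis[i]] = i: dimension j of the gradient output must come
    // from wherever the forward pass sent dimension j.
    std::vector<int> InversePerm(const std::vector<int>& axis) {
      std::vector<int> reversed(axis.size());
      for (size_t i = 0; i < axis.size(); ++i) {
        reversed[axis[i]] = static_cast<int>(i);
      }
      return reversed;
    }
    // InversePerm({1, 2, 0}) == {2, 0, 1}; the 2-D perm {1, 0} is its own
    // inverse, which is why the test's forward and grad expectations coincide.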
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(transpose2); -USE_OP_DEVICE_KERNEL(transpose2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto out = scope->Var("Out"); - auto xshape = scope->Var("XShape"); - auto* x_t = x->GetMutable(); - auto* out_t = out->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto place = ctx.GetPlace(); - - int dim0 = 2; - int dim1 = 3; - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); - ctx.Wait(); - x_t->Resize({dim0, dim1}); - out_t->Resize({dim0, dim1}); - ctx.Wait(); - out_t->mutable_data(place); - ctx.Wait(); - xshape_t->Resize({dim0, dim1}); - xshape_t->mutable_data(place); - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - auto op = f::OpRegistry::CreateOp("transpose2", - {{"X", {"X"}}}, - {{"Out", {"Out"}}, {"XShape", {"XShape"}}}, - attrs); - ctx.Wait(); - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(out_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto xshape = scope->Var("XShape"); - auto x_grad = scope->Var("X@GRAD"); - auto out_grad = scope->Var("Out@GRAD"); - - auto* x_grad_t = x_grad->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto* out_grad_t = out_grad->GetMutable(); - - int dim0 = 2; - int dim1 = 3; - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); - ctx.Wait(); - - x_grad_t->Resize({dim0, dim1}); - xshape_t->Resize( - {0, - dim0, - dim1}); // NOTE(zhiqiu): 0 is needed, see its infershape function - out_grad_t->Resize({dim0, dim1}); - - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - - auto op = f::OpRegistry::CreateOp( - "transpose2_grad", - {{"Out@GRAD", {"Out@GRAD"}}, {"XShape", {"XShape"}}}, - {{"X@GRAD", {"X@GRAD"}}}, - attrs); - - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*x_grad_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -TEST(transpose2, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(transpose2_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc deleted file mode 100644 index b47797a5bb131..0000000000000 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ /dev/null @@ -1,90 
+0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int diagonal = ctx.Attr("diagonal"); - bool lower = ctx.Attr("lower"); - - out->mutable_data(ctx.GetPlace()); - - std::string op_type = lower ? "Tril" : "Triu"; - - framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - - const auto& dev_ctx = - ctx.template device_context(); - - auto op_func_tril = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - auto op_func_triu = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::BOOL) { - if (lower) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_tril, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_triu, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } - } else { - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - tril_triu, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc deleted file mode 100644 index da9fa93130bd1..0000000000000 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
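The deleted tril_triu_op_npu.cc dispatches to Ascend "Tril" or "Triu" and, since those ops lack a bool kernel, routes bool inputs through NpuOpRunner::TypeAdapter as UINT8. Element-wise, both ops reduce to a diagonal predicate; a self-contained sketch of that semantics on a row-major matrix (helper name hypothetical):

    #include <vector>

    // Keep element (i, j) when it lies in the lower (j - i <= diagonal) or
    // upper (j - i >= diagonal) triangle; zero it otherwise.
    std::vector<float> TrilTriu(const std::vector<float>& x, int rows, int cols,
                                int diagonal, bool lower) {
      std::vector<float> out(x.size(), 0.0f);
      for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
          bool keep = lower ? (j - i <= diagonal) : (j - i >= diagonal);
          if (keep) out[i * cols + j] = x[i * cols + j];
        }
      }
      return out;
    }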
*/ - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" - -namespace paddle { -namespace operators { - -template -class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal - std::vector shape = ctx.Attr>("shape"); - phi::DenseTensor shape_tensor(phi::DataType::INT32); - shape_tensor.mutable_data({static_cast(shape.size())}, - ctx.GetPlace()); - paddle::framework::TensorFromVector( - shape, ctx.device_context(), &shape_tensor); - float mean = ctx.Attr("mean"); - phi::DenseTensor mean_tensor(phi::DataType::FLOAT32); - mean_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&mean_tensor, mean); - - float std = ctx.Attr("std"); - phi::DenseTensor std_tensor(phi::DataType::FLOAT32); - std_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&std_tensor, std); - - int32_t seed_var = ctx.Attr("seed"); - - phi::DenseTensor min_tensor(phi::DataType::FLOAT32); - min_tensor.mutable_data({1}, ctx.GetPlace()); - float min_value = mean - std * 2.0; - FillNpuTensorWithConstant(&min_tensor, min_value); - - phi::DenseTensor max_tensor(phi::DataType::FLOAT32); - max_tensor.mutable_data({1}, ctx.GetPlace()); - float max_value = mean + std * 2.0; - FillNpuTensorWithConstant(&max_tensor, max_value); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner( - "ParameterizedTruncatedNormal", - {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, - {*out}, - {{"seed", seed_var}}); - runner.Run(stream); - } -}; - -// NOTE(zhiqiu): actually, this is cpu version kernel, and we need to make the -// above -// npu version work in the future. -template -class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); - TruncatedNormal truncated_normal(mean, std); - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = truncated_normal(dist(*engine)); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(truncated_gaussian_random, - ops::NPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc deleted file mode 100644 index 5958a7751b8be..0000000000000 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
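truncated_gaussian_random_op_npu.cc above carries two kernels: an NPU path that feeds ParameterizedTruncatedNormal host-built scalar tensors for mean, std, and the clip bounds mean - 2*std and mean + 2*std, and the CPU fallback that is actually registered, which maps uniform samples through a TruncatedNormal functor and copies the result to the device. A rough standalone illustration of the two-sigma truncation (plain rejection sampling here, not the inverse-CDF transform the fallback uses):

    #include <random>
    #include <vector>

    // Draw n samples from N(mean, stddev^2) clipped to
    // [mean - 2*stddev, mean + 2*stddev].
    std::vector<float> TruncatedNormal2Sigma(float mean, float stddev, int n,
                                             unsigned int seed) {
      std::mt19937 engine(seed);
      std::normal_distribution<float> dist(mean, stddev);
      std::vector<float> out;
      out.reserve(n);
      while (static_cast<int>(out.size()) < n) {
        float v = dist(engine);
        if (v >= mean - 2.0f * stddev && v <= mean + 2.0f * stddev) {
          out.push_back(v);
        }
      }
      return out;
    }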
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/uniform_random_op.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUUniformRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - phi::DenseTensor *tensor = nullptr; - auto out_var = ctx.OutputVar("Out"); - std::vector new_shape; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { - if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - - if (out_var->IsType()) { - auto *selected_rows = out_var->GetMutable(); - tensor = selected_rows->mutable_value(); - auto shape = ctx.Attr>("shape"); - if (!new_shape.empty()) shape = new_shape; - tensor->Resize(phi::make_ddim(shape)); - selected_rows->mutable_rows()->reserve(shape[0]); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable(); - if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be " - "phi::DenseTensor, " - "SelectedRows. 
But got " - "unsupport type: %s.", - framework::ToTypeName(out_var->Type()))); - } - tensor->mutable_data(ctx.GetPlace()); - int64_t size = tensor->numel(); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); - - std::uniform_real_distribution dist( - static_cast(ctx.Attr("min")), - static_cast(ctx.Attr("max"))); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data_cpu[i] = dist(*engine); - } - - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - auto diag_val = static_cast(ctx.Attr("diag_val")); - if (diag_num > 0) { - PADDLE_ENFORCE_GT( - size, - (diag_num - 1) * (diag_step + 1), - platform::errors::InvalidArgument( - "ShapeInvalid: the diagonal's elements is equal (num-1) " - "* (step-1) with num %d, step %d," - "It should be smaller than %d, but received %d", - diag_num, - diag_step, - (diag_num - 1) * (diag_step + 1), - size)); - for (int64_t i = 0; i < diag_num; ++i) { - int64_t pos = i * diag_step + i; - data_cpu[pos] = diag_val; - } - } - - // copy to NPU - framework::TensorCopy( - cpu_tensor, - ctx.GetPlace(), - ctx.template device_context(), - tensor); - ctx.template device_context().Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(uniform_random, - paddle::operators::NPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unsqueeze_op_npu.cc b/paddle/fluid/operators/unsqueeze_op_npu.cc deleted file mode 100644 index b2b09faaa9d44..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc deleted file mode 100644 index bf66941f90278..0000000000000 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
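uniform_random_op_npu.cc above resolves the output shape with the usual precedence (the ShapeTensor input first, then ShapeTensorList, then the shape attribute), samples uniformly into a CPU tensor, optionally overwrites a strided diagonal, and only then copies to the NPU. The diagonal fill places diag_val at flat positions i * diag_step + i after enforcing that the last such position fits; sketched standalone (helper name hypothetical):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Overwrite diag_num entries at stride diag_step + 1, mirroring the
    // deleted kernel's post-processing of the sampled buffer.
    void FillDiag(std::vector<float>* data, int64_t diag_num, int64_t diag_step,
                  float diag_val) {
      if (diag_num <= 0) return;
      // Matches the kernel's PADDLE_ENFORCE_GT bound on the last position.
      assert(static_cast<int64_t>(data->size()) >
             (diag_num - 1) * (diag_step + 1));
      for (int64_t i = 0; i < diag_num; ++i) {
        (*data)[i * diag_step + i] = diag_val;
      }
    }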
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(unsqueeze); -USE_OP_DEVICE_KERNEL(unsqueeze, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - int dim0 = 5; - int dim1 = 10; - - std::vector init; - for (int64_t i = 0; i < dim0 * dim1; ++i) { - init.push_back(static_cast(0.1)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({dim0, dim1}); - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector axis; - axis.push_back(1); - f::AttributeMap attrs = {{"axes", axis}}; - - auto op = f::OpRegistry::CreateOp( - "unsqueeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(3)); - EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(5)); - EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(1)); - EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10)); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(0.1)); - } - - ctx.Wait(); -} - -TEST(unsqueeze, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc deleted file mode 100644 index 4c1aa39168b69..0000000000000 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
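The unsqueeze NPU test above expects a {5, 10} input with axes = {1} to come out as {5, 1, 10}: each listed axis inserts a size-1 dimension at that position in the output shape. A sketch of the shape computation, assuming the axes are already normalized and ascending (as they are in the test):

    #include <cstdint>
    #include <vector>

    // Insert a size-1 dimension at each output axis, left to right.
    std::vector<int64_t> UnsqueezeDims(std::vector<int64_t> dims,
                                       const std::vector<int>& axes) {
      for (int axis : axes) {
        dims.insert(dims.begin() + axis, 1);
      }
      return dims;
    }
    // UnsqueezeDims({5, 10}, {1}) == {5, 1, 10}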
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class UnStackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto &runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -template -class UnStackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - unstack, - ops::UnStackNPUKernel, - ops::UnStackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - unstack_grad, - ops::UnStackGradNPUKernel, - ops::UnStackGradNPUKernel); diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc deleted file mode 100644 index b5c61e6b988aa..0000000000000 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
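unstack_op_npu.cc above maps unstack to Ascend "Unpack" and its gradient to "Pack"; the only host-side arithmetic is axis normalization. The forward pass adds the input rank to a negative axis, while the gradient adds rank + 1, because re-stacking the slice gradients creates one extra dimension. As a one-liner sketch (name hypothetical):

    // Normalize a possibly negative unstack axis against the rank of the
    // tensors at hand; the gradient path normalizes against rank + 1.
    inline int NormalizeUnstackAxis(int axis, int rank, bool is_grad) {
      if (axis < 0) axis += is_grad ? rank + 1 : rank;
      return axis;
    }
    // NormalizeUnstackAxis(-1, 3, /*is_grad=*/false) == 2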
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class NPUWhereIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); - - auto dims = condition->dims(); - const int rank = dims.size(); - - auto place = context.GetPlace(); - const aclrtStream& stream = dev_ctx.stream(); - - // Run Cast and ReduceSum to get 0 dim of Out - phi::DenseTensor booled_cond; - if (framework::TransToProtoVarType(condition->dtype()) != - framework::proto::VarType::BOOL) { - auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL); - booled_cond.mutable_data(dims, place); - const auto& booled_runner = - NpuOpRunner("Cast", - {*condition}, - {booled_cond}, - {{"dst_type", static_cast(bool_type)}}); - booled_runner.Run(stream); - } else { - booled_cond.ShareDataWith(*condition); - } - phi::DenseTensor casted_cond; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64); - casted_cond.mutable_data(dims, place); - const auto& cast_runner = - NpuOpRunner("Cast", - {booled_cond}, - {casted_cond}, - {{"dst_type", static_cast(dst_dtype)}}); - cast_runner.Run(stream); - - phi::DenseTensor sumed_true_num; - sumed_true_num.mutable_data({1}, place); - phi::DenseTensor cond_axes; - cond_axes.mutable_data({dims.size()}, place); - std::vector axes_vec; - for (int i = 0; i < dims.size(); ++i) { - axes_vec.push_back(i); - } - framework::TensorFromVector(axes_vec, dev_ctx, &cond_axes); - const auto& sum_runner = NpuOpRunner("ReduceSum", - {casted_cond, cond_axes}, - {sumed_true_num}, - {{"keep_dims", false}}); - sum_runner.Run(stream); - - phi::DenseTensor local_true_num; - paddle::framework::TensorCopySync( - sumed_true_num, platform::CPUPlace(), &local_true_num); - auto true_num = *local_true_num.data(); - - out->Resize(phi::make_ddim({true_num, rank})); - out->mutable_data(place); - - if (true_num == 0) { - return; - } - - out->set_layout(DataLayout::kAnyLayout); - NpuOpRunner runner{"Where", {*condition}, {*out}}; - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(where_index, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel); diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc deleted file mode 100644 index e1af771f947bb..0000000000000 --- a/paddle/fluid/operators/where_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
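where_index_op_npu.cc above must size Out before it can launch Where: it casts the condition to bool if needed, casts that to INT64, ReduceSums across every axis to count the true elements, copies the count back to the host, and resizes Out to {true_num, rank}. The counting pass has a direct host analogue:

    #include <cstdint>
    #include <vector>

    // Count nonzero entries and record their flat positions; pos.size()
    // plays the role of true_num in the deleted kernel.
    template <typename T>
    std::vector<int64_t> NonzeroPositions(const std::vector<T>& cond) {
      std::vector<int64_t> pos;
      for (int64_t i = 0; i < static_cast<int64_t>(cond.size()); ++i) {
        if (cond[i] != static_cast<T>(0)) pos.push_back(i);
      }
      return pos;
    }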
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class WhereNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Select", {*condition, *X, *Y}, {*out}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class WhereGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* dout_t = ctx.Input(framework::GradVarName("Out")); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - auto* dy_t = ctx.Output(framework::GradVarName("Y")); - - if (dx_t != nullptr) { - dx_t->mutable_data(ctx.GetPlace()); - } - if (dy_t != nullptr) { - dy_t->mutable_data(ctx.GetPlace()); - } - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_zeros(dout_t->dtype()); - tensor_zeros.mutable_data(dout_t->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ZerosLike", {*dout_t}, {tensor_zeros}, {}); - runner.Run(stream); - - if (dx_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, *dout_t, tensor_zeros}, {*dx_t}, {}); - runner.Run(stream); - } - if (dy_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, tensor_zeros, *dout_t}, {*dy_t}, {}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - where, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel); - -REGISTER_OP_NPU_KERNEL( - where_grad, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel);
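Finally, where_grad in the deleted where_op_npu.cc builds one ZerosLike tensor and runs Select twice, dX = Select(cond, dOut, zeros) and dY = Select(cond, zeros, dOut), skipping either output when its gradient is not requested. Element-wise that amounts to:

    #include <vector>

    // dX keeps dOut where cond holds; dY keeps it where cond does not.
    void WhereGrad(const std::vector<bool>& cond, const std::vector<float>& dout,
                   std::vector<float>* dx, std::vector<float>* dy) {
      dx->assign(dout.size(), 0.0f);
      dy->assign(dout.size(), 0.0f);
      for (size_t i = 0; i < dout.size(); ++i) {
        if (cond[i]) (*dx)[i] = dout[i];
        else         (*dy)[i] = dout[i];
      }
    }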