From 65420271609b8cce860ec8034569292db7d13d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Wed, 7 Dec 2022 14:36:34 +0800 Subject: [PATCH] [phi::DenseTensor] Replace Tensor with phi::DenseTensor (#48682) --- .../fluid/imperative/gradient_accumulator.cc | 10 +- paddle/fluid/operators/abs_op_mlu.cc | 4 +- paddle/fluid/operators/abs_op_npu.cc | 2 - paddle/fluid/operators/activation_op_mlu.cc | 4 +- paddle/fluid/operators/activation_op_npu.cc | 218 +++++++++--------- paddle/fluid/operators/affine_grid_op.cc | 2 - .../amp/alloc_float_status_op_npu.cc | 2 - .../amp/check_finite_and_unscale_op_mlu.cc | 8 +- .../amp/check_finite_and_unscale_op_npu.cc | 10 +- .../check_finite_and_unscale_op_npu_test.cc | 4 +- .../amp/clear_float_status_op_npu.cc | 4 +- .../operators/amp/get_float_status_op_npu.cc | 4 +- .../amp/update_loss_scaling_op_npu.cc | 6 +- paddle/fluid/operators/arg_max_op_npu.cc | 3 +- paddle/fluid/operators/arg_min_op_npu.cc | 1 - paddle/fluid/operators/argsort_op_npu.cc | 35 ++- paddle/fluid/operators/attention_lstm_op.cc | 33 +-- paddle/fluid/operators/attention_lstm_op.h | 2 - paddle/fluid/operators/batch_norm_op.cc | 18 +- paddle/fluid/operators/batch_norm_op.cu | 1 - paddle/fluid/operators/batch_norm_op.h | 1 - paddle/fluid/operators/batch_norm_op_mlu.cc | 12 +- paddle/fluid/operators/batch_norm_op_npu.cc | 2 +- paddle/fluid/operators/bce_loss_op_mlu.cc | 2 - paddle/fluid/operators/bce_loss_op_npu.cc | 2 - paddle/fluid/operators/cast_op.cc | 2 +- paddle/fluid/operators/cast_op_mlu.cc | 2 - paddle/fluid/operators/cast_op_npu.cc | 2 - paddle/fluid/operators/center_loss_op.h | 3 +- paddle/fluid/operators/clip_by_norm_op.h | 1 - paddle/fluid/operators/clip_by_norm_op_npu.cc | 8 +- paddle/fluid/operators/clip_op_mlu.cc | 8 +- paddle/fluid/operators/clip_op_npu.cc | 10 +- paddle/fluid/operators/coalesce_tensor_op.cc | 2 +- .../operators/collective/c_allreduce_op.h | 5 +- .../c_softmax_with_cross_entropy_op.cu | 20 +- paddle/fluid/operators/concat_op.cc | 1 - paddle/fluid/operators/concat_op_mlu.cc | 4 +- .../operators/controlflow/logical_op_mlu.cc | 2 - .../operators/controlflow/logical_op_npu.cc | 2 - paddle/fluid/operators/conv_op.h | 2 - paddle/fluid/operators/conv_op_mlu.cc | 33 ++- paddle/fluid/operators/conv_op_npu.cc | 29 +-- .../fluid/operators/conv_transpose_op_mlu.cc | 17 +- .../fluid/operators/conv_transpose_op_npu.cc | 9 +- paddle/fluid/operators/copy_cross_scope_op.cc | 2 - paddle/fluid/operators/correlation_op.cc | 2 - paddle/fluid/operators/cos_sim_op.h | 6 +- paddle/fluid/operators/crop_op_npu.cc | 6 +- paddle/fluid/operators/cross_entropy_op.h | 6 +- paddle/fluid/operators/ctc_align_op.h | 2 - paddle/fluid/operators/cudnn_lstm_op.cu.cc | 49 ++-- paddle/fluid/operators/cumsum_op_mlu.cc | 4 +- paddle/fluid/operators/cumsum_op_npu.cc | 8 +- paddle/fluid/operators/cvm_op.cc | 2 - paddle/fluid/operators/cvm_op.cu | 1 - paddle/fluid/operators/cvm_op.h | 2 - paddle/fluid/operators/data_norm_op.cc | 13 +- paddle/fluid/operators/data_norm_op.cu | 3 +- .../fluid/operators/deformable_conv_op_mlu.cc | 30 ++- .../operators/deformable_psroi_pooling_op.cu | 1 - .../operators/deformable_psroi_pooling_op.h | 4 +- .../fluid/operators/detection/bbox_util.cu.h | 16 +- .../operators/detection/bipartite_match_op.cc | 4 +- .../fluid/operators/detection/box_clip_op.cu | 1 - .../fluid/operators/detection/box_clip_op.h | 9 +- .../operators/detection/box_coder_op_npu.cc | 135 +++++------ 
.../detection/collect_fpn_proposals_op.cc | 1 - .../detection/collect_fpn_proposals_op.cu | 26 +-- .../detection/density_prior_box_op_npu.cc | 71 +++--- .../detection/generate_mask_labels_op.cc | 58 ++--- .../detection/generate_proposal_labels_op.cc | 62 ++--- .../detection/generate_proposals_op.cc | 42 ++-- .../detection/generate_proposals_op.cu | 40 ++-- .../detection/generate_proposals_v2_op.cc | 2 - .../detection/iou_similarity_op_mlu.cc | 50 ++-- .../detection/iou_similarity_op_npu.cc | 50 ++-- .../detection/locality_aware_nms_op.cc | 10 +- .../operators/detection/matrix_nms_op.cc | 2 - .../operators/detection/multiclass_nms_op.cc | 10 +- .../detection/polygon_box_transform_op.cc | 2 - .../detection/polygon_box_transform_op.cu | 1 - .../operators/detection/prior_box_op_npu.cc | 8 +- .../retinanet_detection_output_op.cc | 47 ++-- .../detection/roi_perspective_transform_op.cc | 14 +- .../detection/rpn_target_assign_op.cc | 138 +++++------ .../detection/sigmoid_focal_loss_op.cu | 21 +- .../detection/sigmoid_focal_loss_op.h | 21 +- .../operators/detection/yolo_box_op_mlu.cc | 2 +- paddle/fluid/operators/detection_map_op.cc | 2 - paddle/fluid/operators/dgc_clip_by_norm_op.h | 2 - paddle/fluid/operators/dropout_op_mlu.cc | 8 +- paddle/fluid/operators/dropout_op_npu.cc | 16 +- .../elementwise/elementwise_add_op_mlu.cc | 1 - .../elementwise/elementwise_add_op_npu.cc | 7 +- .../elementwise/elementwise_div_op.h | 1 - .../elementwise/elementwise_div_op_mlu.cc | 8 +- .../elementwise/elementwise_div_op_npu.cc | 22 +- .../elementwise_floordiv_op_npu.cc | 2 - .../elementwise/elementwise_max_op_npu.cc | 18 +- .../elementwise/elementwise_min_op_mlu.cc | 2 - .../elementwise/elementwise_min_op_npu.cc | 16 +- .../operators/elementwise/elementwise_mlu.h | 6 +- .../elementwise/elementwise_mod_op_npu.cc | 4 +- .../elementwise/elementwise_mul_op.h | 1 - .../elementwise/elementwise_mul_op_mlu.cc | 5 +- .../elementwise/elementwise_mul_op_npu.cc | 9 +- .../operators/elementwise/elementwise_npu.h | 9 +- .../operators/elementwise/elementwise_op.h | 6 - .../elementwise/elementwise_pow_op_mlu.cc | 12 +- .../elementwise/elementwise_pow_op_npu.cc | 32 ++- .../elementwise/elementwise_sub_op_mlu.cc | 2 - .../elementwise/elementwise_sub_op_npu.cc | 8 +- paddle/fluid/operators/expand_as_op.h | 1 - paddle/fluid/operators/expand_as_v2_op.h | 1 - paddle/fluid/operators/expand_as_v2_op_mlu.cc | 2 - paddle/fluid/operators/expand_op.h | 1 - paddle/fluid/operators/expand_v2_op_npu.cc | 9 +- paddle/fluid/operators/eye_op_npu.cc | 2 - paddle/fluid/operators/fc_op.h | 1 - .../fill_constant_batch_size_like_op_npu.cc | 4 +- .../fluid/operators/fill_constant_op_mlu.cc | 3 +- paddle/fluid/operators/filter_by_instag_op.cu | 1 - paddle/fluid/operators/filter_by_instag_op.h | 1 - paddle/fluid/operators/flatten_op.cc | 2 - paddle/fluid/operators/flatten_op_npu.cc | 2 - paddle/fluid/operators/fsp_op.h | 2 - paddle/fluid/operators/fused/attn_gemm.h | 1 - paddle/fluid/operators/fused/attn_gemm_int8.h | 1 - .../fluid/operators/fused/conv_fusion_op.cu | 10 +- .../operators/fused/cudnn_bn_add_relu_test.cc | 171 +++++++------- .../fused/cudnn_bn_stats_finalize.cu.h | 21 +- .../operators/fused/cudnn_norm_conv.cu.h | 21 +- .../operators/fused/cudnn_norm_conv_test.cc | 13 +- .../fused/cudnn_scale_bias_add_relu.cu.h | 39 ++-- paddle/fluid/operators/fused/fmha_ref.h | 2 - .../operators/fused/fused_attention_op.cc | 2 - .../operators/fused/fused_attention_op.cu | 8 +- .../operators/fused/fused_attention_op_xpu.cc | 157 +++++++------ 
...sed_bias_dropout_residual_layer_norm_op.cc | 2 - ...sed_bias_dropout_residual_layer_norm_op.cu | 2 - .../operators/fused/fused_bn_activation_op.cc | 6 +- .../operators/fused/fused_bn_activation_op.cu | 5 +- .../operators/fused/fused_bn_activation_op.h | 1 - .../fused/fused_bn_add_activation_op.cc | 6 +- .../fused/fused_bn_add_activation_op.cu | 5 +- .../fused/fused_bn_add_activation_op.h | 1 - .../fused_embedding_eltwise_layernorm_op.cu | 1 - .../fused/fused_embedding_fc_lstm_op.cc | 21 +- .../fused/fused_embedding_fc_lstm_op.h | 2 - .../fused/fused_embedding_seq_pool_op.h | 5 +- .../operators/fused/fused_feedforward_op.cc | 1 - .../operators/fused/fused_feedforward_op.cu | 2 - .../fused/fused_feedforward_op_xpu.cc | 194 ++++++++-------- .../operators/fused/fused_gate_attention.h | 36 ++- .../fused/fused_gate_attention_op.cc | 1 - .../fused/fused_gate_attention_op.cu | 93 ++++---- .../operators/fused/fused_gemm_epilogue_op.cc | 1 - .../operators/fused/fused_gemm_epilogue_op.cu | 2 - .../fused/fused_gemm_epilogue_op_xpu.cc | 2 - .../fused/fused_multi_transformer_int8_op.cc | 4 +- .../fused/fused_multi_transformer_int8_op.cu | 38 +-- .../fused/fused_multi_transformer_op.cc | 4 +- .../fused/fused_multi_transformer_op.cu | 92 ++++---- .../fused/fused_multi_transformer_op.cu.h | 12 +- .../fused/fusion_conv_inception_op.cu | 1 - paddle/fluid/operators/fused/fusion_gru_op.cc | 29 +-- paddle/fluid/operators/fused/fusion_gru_op.h | 2 - .../fluid/operators/fused/fusion_lstm_op.cc | 21 +- paddle/fluid/operators/fused/fusion_lstm_op.h | 2 - .../fused/fusion_repeated_fc_relu_op.cc | 8 +- .../fused/fusion_repeated_fc_relu_op.h | 2 - .../fused/fusion_seqconv_eltadd_relu_op.cc | 20 +- .../fused/fusion_seqconv_eltadd_relu_op.h | 2 - .../fused/fusion_seqexpand_concat_fc_op.cc | 7 +- .../fused/fusion_seqexpand_concat_fc_op.h | 2 - .../fused/fusion_seqpool_concat_op.h | 2 - .../fused/fusion_seqpool_cvm_concat_op.cc | 3 +- .../fused/fusion_seqpool_cvm_concat_op.h | 2 - .../fused/fusion_squared_mat_sub_op.cc | 12 +- .../fused/fusion_squared_mat_sub_op.h | 2 - .../operators/fused/multihead_matmul_op.cu | 11 +- .../operators/fused/resnet_basic_block_op.cc | 1 - .../fused/resnet_basic_block_op_xpu.cc | 2 - .../fluid/operators/fused/resnet_unit_op.cc | 2 - .../fluid/operators/fused/resnet_unit_op.cu | 133 ++++++----- .../operators/fused/resnet_unit_op_xpu.cc | 101 ++++---- .../operators/fused/skip_layernorm_op.cu | 1 - .../fused/xpu_fused_common_function.h | 15 +- .../fluid/operators/fused/yolo_box_head_op.cu | 1 - .../fluid/operators/fused/yolo_box_post_op.cu | 1 - paddle/fluid/operators/gather_nd_op_mlu.cc | 2 - paddle/fluid/operators/gather_nd_op_npu.cc | 1 - .../fluid/operators/gather_scatter_kernel.cc | 24 +- .../fluid/operators/gather_scatter_kernel.cu | 26 +-- .../fluid/operators/gather_scatter_kernel.h | 50 ++-- paddle/fluid/operators/gaussian_random_op.cc | 2 - .../fluid/operators/gaussian_random_op_mlu.cc | 3 +- .../fluid/operators/gaussian_random_op_npu.cc | 3 +- paddle/fluid/operators/gelu_op_npu.cc | 2 - .../fluid/operators/graph_khop_sampler_op.cu | 2 - .../fluid/operators/graph_khop_sampler_op.h | 2 - paddle/fluid/operators/grid_sampler_op_mlu.cc | 6 +- paddle/fluid/operators/group_norm_op.cc | 15 +- paddle/fluid/operators/group_norm_op.cu | 10 +- paddle/fluid/operators/group_norm_op.h | 1 - paddle/fluid/operators/group_norm_op_npu.cc | 47 ++-- paddle/fluid/operators/gru_op.cc | 14 +- paddle/fluid/operators/gru_op.cu.cc | 9 +- paddle/fluid/operators/gru_op.h | 21 +- 
paddle/fluid/operators/gru_unit_op.h | 6 +- paddle/fluid/operators/huber_loss_op_mlu.cc | 26 +-- paddle/fluid/operators/huber_loss_op_npu.cc | 6 +- paddle/fluid/operators/im2sequence_op.h | 37 ++- paddle/fluid/operators/index_sample_op_npu.cc | 5 +- paddle/fluid/operators/index_select_op.h | 1 - paddle/fluid/operators/index_select_op_npu.cc | 8 +- paddle/fluid/operators/inplace_abn_op.cc | 10 +- paddle/fluid/operators/inplace_abn_op.cu | 6 +- paddle/fluid/operators/inplace_abn_op.h | 1 - paddle/fluid/operators/instance_norm_op.cc | 12 +- paddle/fluid/operators/instance_norm_op.h | 1 - .../fluid/operators/instance_norm_op_npu.cc | 3 +- paddle/fluid/operators/interpolate_op.cu | 16 +- paddle/fluid/operators/interpolate_op.h | 3 +- paddle/fluid/operators/interpolate_op_npu.cc | 5 +- .../fluid/operators/interpolate_v2_op_mlu.cc | 4 +- .../fluid/operators/interpolate_v2_op_npu.cc | 72 +++--- paddle/fluid/operators/jit/benchmark.cc | 35 ++- paddle/fluid/operators/kldiv_loss_op_npu.cc | 4 +- paddle/fluid/operators/label_smooth_op_mlu.cc | 2 - paddle/fluid/operators/label_smooth_op_npu.cc | 10 +- paddle/fluid/operators/layer_norm_kernel.cu.h | 1 - paddle/fluid/operators/layer_norm_op.cc | 7 +- paddle/fluid/operators/layer_norm_op_mlu.cc | 17 +- paddle/fluid/operators/layer_norm_op_npu.cc | 33 ++- paddle/fluid/operators/layout_utils.h | 2 - .../fluid/operators/limit_by_capacity_op.cu | 2 - paddle/fluid/operators/log_loss_op_npu.cc | 2 - paddle/fluid/operators/log_loss_op_xpu.cc | 2 - .../fluid/operators/lookup_table_dequant_op.h | 1 - paddle/fluid/operators/lookup_table_op.h | 1 - paddle/fluid/operators/lookup_table_v2_op.h | 11 +- .../fluid/operators/lookup_table_v2_op_mlu.cc | 4 +- .../fluid/operators/lookup_table_v2_op_npu.cc | 11 +- paddle/fluid/operators/lrn_op.h | 3 - paddle/fluid/operators/lstm_op.h | 40 ++-- paddle/fluid/operators/lstmp_op.h | 47 ++-- .../fluid/operators/masked_select_op_mlu.cc | 14 +- .../fluid/operators/match_matrix_tensor_op.cc | 3 +- .../fluid/operators/match_matrix_tensor_op.h | 1 - paddle/fluid/operators/math/context_project.h | 45 ++-- .../operators/math/eigen_values_vectors.h | 26 +-- paddle/fluid/operators/math/sample_prob.cu | 4 +- paddle/fluid/operators/math/sample_prob.h | 2 - .../fluid/operators/math/sequence_pooling.cc | 5 +- paddle/fluid/operators/math/softmax.cu | 5 +- paddle/fluid/operators/math/tree2col.cu | 9 +- paddle/fluid/operators/matmul_op_mlu.cc | 10 +- paddle/fluid/operators/matmul_op_npu.cc | 27 ++- paddle/fluid/operators/matmul_v2_op_mlu.cc | 10 +- paddle/fluid/operators/matmul_v2_op_npu.cc | 21 +- paddle/fluid/operators/mean_iou_op.h | 7 +- paddle/fluid/operators/mean_op_mlu.cc | 17 +- paddle/fluid/operators/mean_op_npu.cc | 21 +- paddle/fluid/operators/meshgrid_op_mlu.cc | 12 +- .../operators/metrics/accuracy_op_mlu.cc | 14 +- .../operators/metrics/accuracy_op_xpu.cc | 1 - .../operators/metrics/precision_recall_op.h | 1 - .../operators/mkldnn/dequantize_mkldnn_op.cc | 1 - .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 49 ++-- .../operators/mkldnn/quantize_mkldnn_op.cc | 1 - .../operators/mkldnn/requantize_mkldnn_op.cc | 1 - .../operators/mkldnn/reshape_mkldnn_op.cc | 2 +- .../operators/mkldnn/transpose_mkldnn_op.cc | 10 +- paddle/fluid/operators/mlu/mlu_baseop.cc | 128 +++++----- paddle/fluid/operators/mlu/mlu_baseop.h | 13 +- .../fluid/operators/modified_huber_loss_op.cu | 2 - .../fluid/operators/modified_huber_loss_op.h | 1 - paddle/fluid/operators/multi_dot_op.cc | 1 - paddle/fluid/operators/multinomial_op_npu.cc | 2 - 
paddle/fluid/operators/multiplex_op.cc | 2 - paddle/fluid/operators/nce_op.h | 11 +- paddle/fluid/operators/norm_op_npu.cc | 1 - paddle/fluid/operators/norm_utils.cu.h | 29 ++- paddle/fluid/operators/number_count_op.cu | 2 - paddle/fluid/operators/one_hot_op.h | 1 - paddle/fluid/operators/one_hot_op_npu.cc | 3 +- paddle/fluid/operators/one_hot_op_xpu.cc | 2 - paddle/fluid/operators/one_hot_v2_op_mlu.cc | 13 +- paddle/fluid/operators/one_hot_v2_op_npu.cc | 3 +- .../fluid/operators/optimizers/adadelta_op.cc | 2 - .../fluid/operators/optimizers/adagrad_op.cc | 1 - paddle/fluid/operators/optimizers/adam_op.h | 2 - .../fluid/operators/optimizers/adam_op_mlu.cc | 14 +- .../fluid/operators/optimizers/adam_op_npu.cc | 14 +- .../fluid/operators/optimizers/adamax_op.cc | 1 - .../optimizers/decayed_adagrad_op.cc | 1 - paddle/fluid/operators/optimizers/dpsgd_op.cc | 1 - paddle/fluid/operators/optimizers/ftrl_op.cc | 1 - paddle/fluid/operators/optimizers/ftrl_op.h | 1 - .../operators/optimizers/merged_adam_op.cc | 2 - .../optimizers/merged_momentum_op_mlu.cc | 5 +- .../fluid/operators/optimizers/momentum_op.cc | 14 +- .../operators/optimizers/momentum_op_mlu.cc | 4 +- .../optimizers/proximal_adagrad_op.cc | 1 - .../optimizers/proximal_adagrad_op.h | 2 - .../operators/optimizers/proximal_gd_op.cc | 1 - .../operators/optimizers/proximal_gd_op.h | 2 - .../operators/optimizers/rmsprop_op_npu.cc | 14 +- .../optimizers/sparse_momentum_op.cc | 19 +- paddle/fluid/operators/p_norm_op_npu.cc | 21 +- paddle/fluid/operators/pad3d_op_npu.cc | 2 - paddle/fluid/operators/pad_op_npu.cc | 2 - paddle/fluid/operators/partial_concat_op.cc | 1 - paddle/fluid/operators/partial_concat_op.cu | 4 +- paddle/fluid/operators/partial_concat_op.h | 1 - paddle/fluid/operators/partial_sum_op.cc | 1 - paddle/fluid/operators/partial_sum_op.cu | 6 +- paddle/fluid/operators/partial_sum_op.h | 2 - paddle/fluid/operators/pool_op.cc | 8 +- paddle/fluid/operators/pool_op.h | 2 - paddle/fluid/operators/pool_op_mlu.cc | 12 +- .../operators/positive_negative_pair_op.h | 2 - paddle/fluid/operators/prelu_op.cc | 2 - paddle/fluid/operators/prroi_pool_op.cc | 2 - paddle/fluid/operators/prroi_pool_op.cu | 2 - paddle/fluid/operators/pyramid_hash_op.cc | 1 - paddle/fluid/operators/random_routing_op.cu | 2 - paddle/fluid/operators/rank_attention_op.cc | 1 - .../operators/reduce_ops/reduce_any_op_npu.cc | 1 - .../reduce_ops/reduce_any_op_npu_test.cc | 2 - .../operators/reduce_ops/reduce_max_op_mlu.cc | 20 +- .../operators/reduce_ops/reduce_max_op_npu.cc | 15 +- .../reduce_ops/reduce_mean_op_mlu.cc | 2 +- .../reduce_ops/reduce_mean_op_npu.cc | 6 +- .../operators/reduce_ops/reduce_min_op_npu.cc | 5 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 9 +- .../operators/reduce_ops/reduce_op_function.h | 1 - .../reduce_ops/reduce_prod_op_npu.cc | 1 - .../operators/reduce_ops/reduce_sum_op.h | 2 +- .../operators/reduce_ops/reduce_sum_op_mlu.cc | 2 +- .../operators/reduce_ops/reduce_sum_op_npu.cc | 2 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/rnn_op_mlu.cc | 3 +- paddle/fluid/operators/roi_align_op.cc | 2 - paddle/fluid/operators/roi_align_op_mlu.cc | 24 +- paddle/fluid/operators/roi_align_op_npu.cc | 11 +- paddle/fluid/operators/roi_pool_op.cc | 2 - paddle/fluid/operators/sample_logits_op.cu | 15 +- paddle/fluid/operators/sample_logits_op.h | 16 +- paddle/fluid/operators/sampling_id_op.cc | 2 - paddle/fluid/operators/sampling_id_op.h | 2 - paddle/fluid/operators/save_combine_op.cc | 2 - paddle/fluid/operators/scatter_op_mlu.cc | 2 +- 
paddle/fluid/operators/scatter_op_npu.cc | 10 +- paddle/fluid/operators/search_compute.h | 1 - paddle/fluid/operators/seed_op.cc | 1 - paddle/fluid/operators/seed_op.h | 1 - paddle/fluid/operators/set_value_op.cc | 27 ++- paddle/fluid/operators/set_value_op.h | 1 - paddle/fluid/operators/set_value_op_mlu.cc | 8 +- paddle/fluid/operators/set_value_op_npu.cc | 6 +- paddle/fluid/operators/shape_op_mlu.cc | 3 +- paddle/fluid/operators/shape_op_npu.cc | 2 - paddle/fluid/operators/shard_index_op_npu.cc | 11 +- paddle/fluid/operators/shuffle_batch_op.h | 1 - paddle/fluid/operators/shuffle_channel_op.cu | 1 - ...igmoid_cross_entropy_with_logits_op_mlu.cc | 1 - ...igmoid_cross_entropy_with_logits_op_npu.cc | 1 - paddle/fluid/operators/similarity_focus_op.h | 1 - paddle/fluid/operators/slice_op.cc | 6 +- paddle/fluid/operators/slice_op_mlu.cc | 2 - paddle/fluid/operators/slice_op_npu.cc | 3 +- paddle/fluid/operators/smooth_l1_loss_op.h | 7 +- .../fluid/operators/smooth_l1_loss_op_npu.cc | 22 +- .../softmax_with_cross_entropy_op_mlu.cc | 2 - .../softmax_with_cross_entropy_op_npu.cc | 6 +- paddle/fluid/operators/space_to_depth_op.cc | 2 - paddle/fluid/operators/sparse_attention_op.cu | 48 ++-- paddle/fluid/operators/split_op_mlu.cc | 2 - paddle/fluid/operators/split_op_npu.cc | 4 +- .../fluid/operators/squared_l2_distance_op.h | 2 - .../fluid/operators/squared_l2_norm_op_mlu.cc | 6 +- .../fluid/operators/squared_l2_norm_op_npu.cc | 8 +- paddle/fluid/operators/stack_op_mlu.cc | 10 +- paddle/fluid/operators/stack_op_npu.cc | 18 +- paddle/fluid/operators/stft_op.h | 16 +- paddle/fluid/operators/strided_slice_op.cc | 6 +- .../fluid/operators/strided_slice_op_mlu.cc | 9 +- .../fluid/operators/strided_slice_op_npu.cc | 31 ++- paddle/fluid/operators/sum_op_mlu.cc | 3 +- paddle/fluid/operators/sum_op_npu.cc | 3 +- paddle/fluid/operators/svd_helper.h | 38 +-- .../fluid/operators/sync_batch_norm_op_mlu.cc | 29 ++- .../fluid/operators/sync_batch_norm_op_npu.cc | 142 ++++++------ .../fluid/operators/take_along_axis_op_npu.cc | 2 - paddle/fluid/operators/tdm_child_op.h | 1 - paddle/fluid/operators/tdm_sampler_op.h | 1 - .../teacher_student_sigmoid_loss_op.cc | 12 +- .../teacher_student_sigmoid_loss_op.h | 1 - paddle/fluid/operators/temporal_shift_op.h | 1 - paddle/fluid/operators/tile_op_mlu.cc | 2 - paddle/fluid/operators/tile_op_npu.cc | 1 - paddle/fluid/operators/top_k_op.cu | 4 +- paddle/fluid/operators/top_k_op.h | 2 - paddle/fluid/operators/top_k_op_npu.cc | 2 +- paddle/fluid/operators/top_k_op_xpu.cc | 1 - paddle/fluid/operators/tree_conv_op.h | 13 +- .../truncated_gaussian_random_op_npu.cc | 14 +- paddle/fluid/operators/uniform_random_op.cc | 2 +- paddle/fluid/operators/uniform_random_op.cu | 3 +- paddle/fluid/operators/uniform_random_op.h | 1 - .../fluid/operators/uniform_random_op_mlu.cc | 5 +- .../fluid/operators/uniform_random_op_npu.cc | 5 +- paddle/fluid/operators/var_conv_2d_op.cc | 37 +-- paddle/fluid/operators/var_conv_2d_op.h | 1 - paddle/fluid/operators/where_index_op_mlu.cc | 6 +- paddle/fluid/operators/where_index_op_npu.cc | 12 +- 419 files changed, 2450 insertions(+), 2880 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b57c874ceebe0..c1838ee201d45 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -644,11 +644,11 @@ void GradientAccumulator::CallGradientHooks() { true, platform::errors::PreconditionNotMet( "Only can call gradient hooks after 
sum gradient completed.")); - PADDLE_ENFORCE_EQ( - HasInnerVar(), - true, - platform::errors::PreconditionNotMet( - "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), + true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when " + "call gradient hook.")); PADDLE_ENFORCE_EQ( inner_var_->Var().IsInitialized(), true, diff --git a/paddle/fluid/operators/abs_op_mlu.cc b/paddle/fluid/operators/abs_op_mlu.cc index 9afa4c28e0544..e635b9547b4fc 100644 --- a/paddle/fluid/operators/abs_op_mlu.cc +++ b/paddle/fluid/operators/abs_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AbsMLUKernel : public framework::OpKernel { public: @@ -54,7 +52,7 @@ class AbsGradMLUKernel : public framework::OpKernel { MLUCnnlOpTensorDesc mul_op_desc( CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); - Tensor sign_x; + phi::DenseTensor sign_x; sign_x.mutable_data(x->dims(), ctx.GetPlace()); MLUCnnl::Sign(ctx, diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc index a1ca88ae5b572..47c88abb9ede1 100644 --- a/paddle/fluid/operators/abs_op_npu.cc +++ b/paddle/fluid/operators/abs_op_npu.cc @@ -18,8 +18,6 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AbsNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 736b398996b45..f26af0a5b9743 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -21,8 +21,6 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ActivationMLUKernel : public framework::OpKernel { public: @@ -442,7 +440,7 @@ class ReciprocalGradMLUKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); - Tensor square_out; + phi::DenseTensor square_out; square_out.Resize(out->dims()); square_out.mutable_data(place); MLUCnnlTensorDesc out_desc(*out); diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 3c6e207b971bc..b471c08d39ce9 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -24,14 +24,12 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PowNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto factor = ctx.Attr("factor"); out->mutable_data(ctx.GetPlace()); @@ -54,9 +52,9 @@ template class PowGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto factor = ctx.Attr("factor"); auto x_dims = x->dims(); @@ -69,7 +67,7 @@ class PowGradNPUKernel : public framework::OpKernel { // NOTE(liym27): dx = dout * factor * x.pow(factor-1) // Step1: Compute x_pow = x.pow(factor-1) - Tensor x_pow(x->type()); + phi::DenseTensor x_pow(x->type()); x_pow.mutable_data(x->dims(), place); const auto& runner_pow = NpuOpRunner( "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); @@ -78,13 +76,13 @@ class PowGradNPUKernel : public framework::OpKernel { // Step 2: Construct a broadcast factor, which has the same shape with x. // 2.1 Get a factor tensor with shape [1]. - Tensor factor_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor factor_tensor(experimental::DataType::FLOAT32); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, factor); // 2.2 Get the factor which has the shape with x and the same value with // factor. - Tensor factor_bc_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor factor_bc_tensor(experimental::DataType::FLOAT32); factor_bc_tensor.mutable_data(x_dims, place); const auto& runner_bc = NpuOpRunner("FillD", {factor_tensor}, @@ -93,7 +91,7 @@ class PowGradNPUKernel : public framework::OpKernel { runner_bc.Run(stream); // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) - Tensor x_power_mul_factor(x->type()); + phi::DenseTensor x_power_mul_factor(x->type()); x_power_mul_factor.mutable_data(x->dims(), place); const auto& runner_mul_1 = NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); @@ -111,8 +109,8 @@ template class ReluNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -134,9 +132,9 @@ template class ReluGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto stream = ctx.template device_context() @@ -153,8 +151,8 @@ template class Relu6NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); @@ -176,9 +174,9 @@ 
template class Relu6GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto stream = ctx.template device_context() @@ -195,9 +193,9 @@ template class SqrtNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -216,8 +214,8 @@ template class LeakyReluNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto alpha = ctx.Attr("alpha"); out->mutable_data(ctx.GetPlace()); @@ -236,9 +234,9 @@ template class LeakyReluGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto alpha = ctx.Attr("alpha"); auto stream = @@ -257,10 +255,10 @@ template class SqrtGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -279,9 +277,9 @@ template class LogNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -291,12 +289,12 @@ class LogNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor one(x->type()); + phi::DenseTensor one(x->type()); one.mutable_data(x->dims(), place); const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); runner_one.Run(stream); - Tensor sub(x->type()); + phi::DenseTensor sub(x->type()); sub.mutable_data(x->dims(), place); const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); runner_sub.Run(stream); @@ -310,10 +308,10 @@ template class LogGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -331,9 +329,9 @@ template class TanhNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); 
+ auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -352,10 +350,10 @@ template class TanhGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -374,9 +372,9 @@ template class SquareNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -395,9 +393,9 @@ template class SquareGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto factor = static_cast(2.0); @@ -406,7 +404,7 @@ class SquareGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); // Step 1: Compute x_muls_factor = factor * x - Tensor x_muls_factor(x->type()); + phi::DenseTensor x_muls_factor(x->type()); x_muls_factor.mutable_data(x->dims(), place); const auto& runner_muls_1 = NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); @@ -424,9 +422,9 @@ template class SigmoidNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); @@ -445,10 +443,10 @@ template class SigmoidGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); @@ -469,8 +467,8 @@ template class SwishNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); float beta = ctx.Attr("beta"); out->mutable_data(ctx.GetPlace()); @@ -494,9 +492,9 @@ template class SwishGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float beta = ctx.Attr("beta"); dx->mutable_data(ctx.GetPlace()); @@ -504,7 +502,7 @@ class SwishGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor beta_x, 
sigmoid_out, swish_out; + phi::DenseTensor beta_x, sigmoid_out, swish_out; beta_x.mutable_data(x->dims(), ctx.GetPlace()); sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); swish_out.mutable_data(x->dims(), ctx.GetPlace()); @@ -543,8 +541,8 @@ template class HardSwishNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); float threshold = ctx.Attr("threshold"); float scale = ctx.Attr("scale"); @@ -558,25 +556,25 @@ class HardSwishNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor tensor_offset(x->type()); + phi::DenseTensor tensor_offset(x->type()); tensor_offset.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - Tensor add_offset_val(x->type()); + phi::DenseTensor add_offset_val(x->type()); add_offset_val.mutable_data(x->dims(), place); const auto& runner_add = NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); runner_add.Run(stream); - Tensor tensor_threshold(x->type()); + phi::DenseTensor tensor_threshold(x->type()); tensor_threshold.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); - Tensor tensor_zero(x->type()); + phi::DenseTensor tensor_zero(x->type()); tensor_zero.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); - Tensor clip_val(x->type()); + phi::DenseTensor clip_val(x->type()); clip_val.mutable_data(x->dims(), place); const auto& runner_clip = NpuOpRunner("ClipByValue", @@ -584,10 +582,10 @@ class HardSwishNPUKernel : public framework::OpKernel { {clip_val}); runner_clip.Run(stream); - Tensor tensor_scale_tmp(x->type()); + phi::DenseTensor tensor_scale_tmp(x->type()); tensor_scale_tmp.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); - Tensor tensor_scale(x->type()); + phi::DenseTensor tensor_scale(x->type()); tensor_scale.mutable_data(x->dims(), place); const auto& runner_fill = NpuOpRunner("FillD", @@ -596,7 +594,7 @@ class HardSwishNPUKernel : public framework::OpKernel { {{"dims", phi::vectorize(x->dims())}}); runner_fill.Run(stream); - Tensor div_val(x->type()); + phi::DenseTensor div_val(x->type()); div_val.mutable_data(x->dims(), place); const auto& runner_div = NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); @@ -611,9 +609,9 @@ template class HardSwishGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); float threshold = ctx.Attr("threshold"); float scale = ctx.Attr("scale"); @@ -627,23 +625,23 @@ class HardSwishGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor tensor_offset(x->type()); + phi::DenseTensor tensor_offset(x->type()); tensor_offset.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - Tensor add_offset_val(x->type()); + phi::DenseTensor add_offset_val(x->type()); add_offset_val.mutable_data(x->dims(), place); const auto& runner_add = NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); runner_add.Run(stream); - Tensor 
tmp1(x->type()); + phi::DenseTensor tmp1(x->type()); tmp1.mutable_data(x->dims(), place); const auto& runner_pow1 = NpuOpRunner( "Power", {*x}, {tmp1}, {{"scale", 2.0f}, {"shift", offset}}); runner_pow1.Run(stream); - Tensor tmp2(x->type()); + phi::DenseTensor tmp2(x->type()); tmp2.mutable_data(x->dims(), place); const auto& runner_ht_grad = NpuOpRunner("HardtanhGrad", @@ -652,17 +650,17 @@ class HardSwishGradNPUKernel : public framework::OpKernel { {{"min_val", 0.0f}, {"max_val", threshold}}); runner_ht_grad.Run(stream); - Tensor tmp3(x->type()); + phi::DenseTensor tmp3(x->type()); tmp3.mutable_data(x->dims(), place); const auto& runner_pow2 = NpuOpRunner( "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); runner_pow2.Run(stream); - Tensor tensor_threshold_tmp(x->type()); + phi::DenseTensor tensor_threshold_tmp(x->type()); tensor_threshold_tmp.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_threshold_tmp, static_cast(threshold)); - Tensor tensor_threshold(x->type()); + phi::DenseTensor tensor_threshold(x->type()); tensor_threshold.mutable_data(x->dims(), place); const auto& runner_fill = NpuOpRunner("FillD", @@ -671,12 +669,12 @@ class HardSwishGradNPUKernel : public framework::OpKernel { {{"dims", phi::vectorize(x->dims())}}); runner_fill.Run(stream); - Tensor tmp_bool(experimental::DataType::BOOL); + phi::DenseTensor tmp_bool(experimental::DataType::BOOL); tmp_bool.mutable_data(x->dims(), place); const auto& runner_less = NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); runner_less.Run(stream); - Tensor tmp4(x->type()); + phi::DenseTensor tmp4(x->type()); tmp4.mutable_data(x->dims(), place); auto dst_dtype = ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); @@ -687,7 +685,7 @@ class HardSwishGradNPUKernel : public framework::OpKernel { {{"dst_type", static_cast(dst_dtype)}}); runner_cast.Run(stream); - Tensor tmp5(x->type()); + phi::DenseTensor tmp5(x->type()); tmp5.mutable_data(x->dims(), place); const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); runner_sub.Run(stream); @@ -701,8 +699,8 @@ template class HardSigmoidNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); @@ -724,10 +722,10 @@ template class HardSigmoidGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dx = ctx.Output(framework::GradVarName("X")); float slope = ctx.Attr("slope"); float offset = ctx.Attr("offset"); @@ -751,8 +749,8 @@ template class ReciprocalNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); auto stream = @@ -767,9 +765,9 @@ template class ReciprocalGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = 
ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); auto stream = @@ -785,8 +783,8 @@ template class CosNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); @@ -804,14 +802,14 @@ template class CosGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); - Tensor sin_out(x->type()); // Temporary Tensor + phi::DenseTensor sin_out(x->type()); // Temporary phi::DenseTensor sin_out.Resize(x->dims()); sin_out.mutable_data(place); @@ -824,7 +822,7 @@ class CosGradNPUKernel : public framework::OpKernel { const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); runner_dx.Run(stream); - Tensor tmp(x->type()); // Temporary Tensor + phi::DenseTensor tmp(x->type()); // Temporary phi::DenseTensor tmp.Resize(phi::make_ddim({1, 1})); tmp.mutable_data(place); float factor = -1.; @@ -840,8 +838,8 @@ template class AtanNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); out->mutable_data(place); const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); @@ -856,9 +854,9 @@ template class AtanGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); auto place = ctx.GetPlace(); dx->mutable_data(place); auto stream = @@ -888,9 +886,9 @@ template class ExpGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() @@ -904,9 +902,9 @@ template class SinNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + auto* out = ctx.Output("Out"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 2d7eb04f1dba0..b23d3670d5e80 100644 --- 
a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -28,8 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AffineGridOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index 508c51de723c0..6c78a20e2a51a 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AllocFloatStatusKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 5f5415ffd37d0..543b40ee8fcd0 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { using MPDType = typename details::MPTypeTrait::Type; @@ -45,7 +43,7 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); // check is_finite or is_nan - Tensor is_finite(found_inf->type()); + phi::DenseTensor is_finite(found_inf->type()); if (i != 0) { is_finite.Resize(phi::make_ddim({1})); is_finite.mutable_data(ctx.GetPlace()); @@ -78,8 +76,8 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { // out = in/scale, if found_inf = false // But when found_inf is true, the data of Out should not be used. // So, on MLU, we always compute out with in/scale. - Tensor float_x; - Tensor float_out; + phi::DenseTensor float_x; + phi::DenseTensor float_out; if (std::is_same::value) { float_x.Resize(x->dims()); float_out.Resize(out->dims()); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 3b6e2ba7184c0..c65b889618f07 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. 
// On NPU, we do not really check the data of input tensors, // but use NPUGetFloatStatus to check whether the nan/inf occurs on device, @@ -47,13 +45,13 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { .stream(); // step1: inverse scale - Tensor const_tensor; + phi::DenseTensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); // Inverse(1.0/scale) phi::DenseTensor* tmp_inverse_out = const_cast(scale); - Tensor inverse_out(scale->type()); + phi::DenseTensor inverse_out(scale->type()); inverse_out.Resize(scale->dims()); inverse_out.mutable_data(ctx.GetPlace()); const auto& runner_inverse = @@ -62,7 +60,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { tmp_inverse_out = &inverse_out; // NOTE(zhiqiu): - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // tmp is only placeholder. @@ -73,7 +71,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { {{"message", std::string("check_nan_and_inf")}}); runner_float_status.Run(stream); - Tensor sum; + phi::DenseTensor sum; sum.mutable_data({1}, ctx.GetPlace()); const auto& runner_reduce_sum = NpuOpRunner("ReduceSumD", diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index cca370bf95331..bf7272ba8b878 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -31,8 +31,6 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -using Tensor = phi::DenseTensor; - USE_OP_ITSELF(check_finite_and_unscale); USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); @@ -110,7 +108,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) { ctx.Wait(); // out found_inf - Tensor found_inf_tensor; + phi::DenseTensor found_inf_tensor; found_inf_tensor.Resize({1}); bool *found_inf_data = found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc index b5750181139d4..18e68e1ba377f 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ClearFloatStatusKernel : public framework::OpKernel { public: @@ -35,7 +33,7 @@ class ClearFloatStatusKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "The input(FloatStatus) and Output(FloatStatusOut) " "should be the same.")); - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); const auto& runner = NpuOpRunner("NPUClearFloatStatus", {tmp}, {*float_status_out}); diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc index 8befb2df9b835..c6dd6f4e6b968 100644 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/get_float_status_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GetFloatStatusKernel : public framework::OpKernel { public: @@ -35,7 +33,7 @@ class GetFloatStatusKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "The input(FloatStatus) and Output(FloatStatusOut) " "should be the same.")); - Tensor tmp; + phi::DenseTensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index b1bfcf8edd672..fb5475610ce15 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -25,8 +25,6 @@ DECLARE_int32(min_loss_scaling); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void Update(const platform::NPUDeviceContext& ctx, const std::vector found_inf_vec, @@ -50,7 +48,7 @@ void Update(const platform::NPUDeviceContext& ctx, good_out_tensor->numel() * sizeof(int), stream); // bad_out_data = bad_in_data + 1 - Tensor factor_tensor(bad_out_tensor->dtype()); + phi::DenseTensor factor_tensor(bad_out_tensor->dtype()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); const auto& runner_p2 = NpuOpRunner( @@ -106,7 +104,7 @@ void Update(const platform::NPUDeviceContext& ctx, stream); // good_out_data = good_in_data + 1 - Tensor factor_tensor(good_out_tensor->dtype()); + phi::DenseTensor factor_tensor(good_out_tensor->dtype()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); const auto& runner_p2 = NpuOpRunner( diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 6e5048db47ead..175703eaf9fa5 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -18,7 +18,6 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -36,7 +35,7 @@ struct VisitDataArgNPUMaxFunctor { auto dtype = ctx.Attr("dtype"); const bool& flatten = ctx.Attr("flatten"); - Tensor transformed_x(x.type()); + phi::DenseTensor transformed_x(x.type()); transformed_x.ShareDataWith(x); if (flatten) { transformed_x.Resize(phi::make_ddim({x.numel()})); diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc index fe917140b7b9f..5132393cd3727 100644 --- a/paddle/fluid/operators/arg_min_op_npu.cc +++ b/paddle/fluid/operators/arg_min_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ArgMinNPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 7aedb41c9fde3..d5a42b8228e0a 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -79,16 +78,16 @@ class ArgsortNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr = {{"axis", -1}, {"descending", descending}}; - Tensor indices_tmp(experimental::DataType::INT32); + phi::DenseTensor indices_tmp(experimental::DataType::INT32); indices_tmp.Resize(indices->dims()); if (framework::TransToProtoVarType(input->dtype()) == framework::proto::VarType::INT64) { - Tensor input_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor input_fp32(experimental::DataType::FLOAT32); input_fp32.Resize(input->dims()); CastToFP32(ctx, stream, *input, &input_fp32); - Tensor output_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor output_fp32(experimental::DataType::FLOAT32); output_fp32.Resize(output->dims()); if (axis == -1 || axis + 1 == in_dims.size()) { @@ -112,12 +111,12 @@ class ArgsortNPUKernel : public framework::OpKernel { } auto trans_dims = phi::make_ddim(shape); - Tensor trans_input(input_fp32.type()); + phi::DenseTensor trans_input(input_fp32.type()); trans_input.Resize(trans_dims); TranposeNPU(ctx, stream, &perm, input_fp32, &trans_input); - Tensor trans_output(input_fp32.type()); - Tensor trans_indices(experimental::DataType::INT32); + phi::DenseTensor trans_output(input_fp32.type()); + phi::DenseTensor trans_indices(experimental::DataType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); trans_indices.mutable_data(trans_dims, ctx.GetPlace()); @@ -150,12 +149,12 @@ class ArgsortNPUKernel : public framework::OpKernel { } auto trans_dims = phi::make_ddim(shape); - Tensor trans_input(input->type()); + phi::DenseTensor trans_input(input->type()); trans_input.Resize(trans_dims); TranposeNPU(ctx, stream, &perm, *input, &trans_input); - Tensor trans_output(input->type()); - Tensor trans_indices(experimental::DataType::INT32); + phi::DenseTensor trans_output(input->type()); + phi::DenseTensor trans_indices(experimental::DataType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); trans_indices.mutable_data(trans_dims, ctx.GetPlace()); @@ -183,12 +182,12 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx, phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; - Tensor input_tmp; + phi::DenseTensor input_tmp; input_tmp.ShareDataWith(input); input_tmp.Resize( phi::make_ddim(std::vector{input_height * input_width})); - Tensor indices_tmp; + phi::DenseTensor indices_tmp; indices_tmp.ShareDataWith(indices); indices_tmp.Resize( phi::make_ddim(std::vector{input_height, input_width})); @@ -197,12 +196,12 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx, for (Type i = 0; i < input_height; i++) { indexs_value.push_back(i * input_width); } - Tensor indexs_tmp(indices.type()); + phi::DenseTensor indexs_tmp(indices.type()); framework::TensorFromVector( indexs_value, ctx.device_context(), &indexs_tmp); indexs_tmp.Resize(phi::make_ddim(std::vector{input_height, 1})); - Tensor indices_index(indices.type()); + phi::DenseTensor indices_index(indices.type()); indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); const auto& runner_add = NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); @@ -212,7 +211,7 @@ static void FullAssignNPU(const framework::ExecutionContext& ctx, phi::make_ddim(std::vector{input_height * input_width})); t_out->mutable_data(ctx.GetPlace()); - Tensor 
out_tmp(t_out->type()); + phi::DenseTensor out_tmp(t_out->type()); out_tmp.ShareDataWith(*t_out); const auto& runner = NpuOpRunner("TensorScatterUpdate", @@ -252,15 +251,15 @@ class ArgsortGradNPUKernel : public framework::OpKernel { } auto trans_dims = phi::make_ddim(shape); - Tensor trans_dout(dO->type()); - Tensor trans_ids(indices->type()); + phi::DenseTensor trans_dout(dO->type()); + phi::DenseTensor trans_ids(indices->type()); trans_dout.Resize(trans_dims); trans_ids.Resize(trans_dims); TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); - Tensor trans_dx(dO->type()); + phi::DenseTensor trans_dx(dO->type()); trans_dx.Resize(trans_dims); FullAssignNPU( ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index d3ae66b3c02ff..9dff9a05d73ad 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -212,39 +212,41 @@ void AttentionLSTMOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); AddInput("C0", - "(Tensor) LSTM C0" + "(phi::DenseTensor) LSTM C0" "This is a tensor with shape (N x D), where N is the batch size, D " "is the gate size." "C0 is necessary because of attention."); AddInput("H0", - "(Tensor, optional) LSTM H0" + "(phi::DenseTensor, optional) LSTM H0" "This is a tensor with shape (N x D), where N is the " "batch size and D is the gate size.") .AsDispensable(); AddInput("AttentionWeight", - "(Tensor) the weights of attention fc. Always relu the fc result." + "(phi::DenseTensor) the weights of attention fc. Always relu the fc " + "result." "The shape is ((M+D) x 1), where M is the dim size of x, D is the " "gate size of LSTM."); AddInput("AttentionBias", - "(Tensor, optional) the bias of attention fc." + "(phi::DenseTensor, optional) the bias of attention fc." "The shape is (1 x 1)") .AsDispensable(); AddInput("AttentionScalar", - "(Tensor, optional) the scalar on the result of attentioned fc. " + "(phi::DenseTensor, optional) the scalar on the result of " + "attentioned fc. " "Always relu the Scalar." "The shape is (1 x 1)") .AsDispensable(); AddInput("AttentionScalarBias", - "(Tensor, optional) the scalar bias of attention fc." + "(phi::DenseTensor, optional) the scalar bias of attention fc." "The shape is (1 x 1)") .AsDispensable(); AddInput("LSTMWeight", - "(Tensor) the combined weight of LSTM" + "(phi::DenseTensor) the combined weight of LSTM" " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " "is the dim size of x" " - Weight = {W_forget, W_input, W_output, W_cell}"); AddInput("LSTMBias", - "(Tensor) the combined bias of LSTM, shape (1x4D)." + "(phi::DenseTensor) the combined bias of LSTM, shape (1x4D)." "Note: we should add the bias of hidden and context accorindg to " "the same gate: " "{B_forget, B_input, B_output, B_cell}"); @@ -257,21 +259,22 @@ void AttentionLSTMOpMaker::Make() { "(phi::DenseTensor) (same as LSTMOp) the cell state of LSTM operator. 
" "The shape is (T x D), and lod is the same with the `Input`."); AddOutput("AttentionedX", - "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + "(phi::DenseTensor) shape is (T x 1), the result after X * " + "AttentionWeight," " where T is the total time steps in this mini-batch," " D is the hidden size.") .AsIntermediate(); AddOutput("AttentionFCOut", - "(Tensor) (max_seq_len, 1), compute at each step.") + "(phi::DenseTensor) (max_seq_len, 1), compute at each step.") .AsIntermediate(); AddOutput("LSTMX", - "(Tensor) the input X of LSTM for each step." + "(phi::DenseTensor) the input X of LSTM for each step." "Shape is (1 x M), where M is the x frame size") .AsIntermediate(); - AddOutput( - "LSTMOUT", - "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." - "Shape is (1 x 4D), where M is the x frame size") + AddOutput("LSTMOUT", + "(phi::DenseTensor) the output of LSTM X(1*(D+M))* " + "weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") .AsIntermediate(); AddAttr("gate_activation", "(string, default: sigmoid)" diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h index 41d7d594df207..0ce83be93c6cc 100644 --- a/paddle/fluid/operators/attention_lstm_op.h +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AttentionLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index abf177ee9f9f4..b4a24c84bcc45 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -207,7 +207,7 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( framework::OpKernelType BatchNormOp::GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -265,7 +265,7 @@ void BatchNormOpMaker::Make() { "The global variance (for training) " "or estimated Variance (for testing)"); AddInput("MomentumTensor", - "(Tensor, optional) If provided, batch_norm will " + "(phi::DenseTensor, optional) If provided, batch_norm will " "use this as momentum, this has a higher priority than " "attr(momentum), the shape of this tensor MUST BE [1].") .AsDispensable(); @@ -380,9 +380,9 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::InvalidArgument("can't find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } @@ -397,7 +397,7 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const { #ifdef PADDLE_WITH_MKLDNN // Only input require reshaping, weights and @@ -522,9 +522,9 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::NotFound("cannot find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = 
&var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 29c40f1b41ef8..e643efcb8b9f5 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -34,7 +34,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template using CudnnDataType = platform::CudnnDataType; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index b11deeb49509b..40cdb68329fb2 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index a2ed462b0fe7b..77397552333d4 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -78,8 +78,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { saved_mean->mutable_data(place); saved_variance->mutable_data(place); - Tensor transformed_x; - Tensor transformed_y; + phi::DenseTensor transformed_x; + phi::DenseTensor transformed_y; const int transformed_dim_size = 4; const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C}; MLUCnnlTensorDesc transformed_desc(transformed_dim_size, @@ -116,7 +116,7 @@ class MLUBatchNormOpKernel : public framework::OpKernel { if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } @@ -226,9 +226,9 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; - Tensor transformed_d_y; - Tensor transformed_x; - Tensor transformed_d_x; + phi::DenseTensor transformed_d_y; + phi::DenseTensor transformed_x; + phi::DenseTensor transformed_d_x; const int transformed_dim_size = 4; const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C}; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index 244b76ff86be9..94c2f7297b821 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -89,7 +89,7 @@ class NPUBatchNormOpKernel : public framework::OpKernel { // is only used in this training branch if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; diff --git a/paddle/fluid/operators/bce_loss_op_mlu.cc b/paddle/fluid/operators/bce_loss_op_mlu.cc index 99fd402424e7c..6541de153d4be 100644 --- a/paddle/fluid/operators/bce_loss_op_mlu.cc +++ b/paddle/fluid/operators/bce_loss_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. 
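The batch_norm hunks above show the recurring shape of this change inside GetExpectedKernelType: a gradient variable that may hold a dense tensor is queried and fetched with the type written out as phi::DenseTensor instead of the removed Tensor alias. A minimal sketch of that lookup, with a hypothetical operator name and the template arguments (elided in the hunks as quoted here) reconstructed:

    framework::OpKernelType HypotheticalGradOp::GetExpectedKernelType(
        const framework::ExecutionContext &ctx) const {
      const auto *var = ctx.InputVar(framework::GradVarName("Y"));
      PADDLE_ENFORCE_NOT_NULL(
          var,
          platform::errors::NotFound("cannot find gradient variable of Y"));
      // Alias-free spelling: query and fetch the dense tensor directly.
      const phi::DenseTensor *t = nullptr;
      if (var->IsType<phi::DenseTensor>()) {
        t = &var->Get<phi::DenseTensor>();
      }
      PADDLE_ENFORCE_NOT_NULL(
          t, platform::errors::NotFound("gradient variable of Y is empty"));
      return framework::OpKernelType(
          framework::TransToProtoVarType(t->dtype()), ctx.GetPlace());
    }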
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class BCELossMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index c6b2d12ac535e..5918bee19453c 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class BCELossNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 5505d3b4e3250..10b25fc478744 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -40,7 +40,7 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { Cast Operator. This Operator casts the input tensor to another data type and -returns the Output Tensor. It's meaningless if the output dtype equals +returns the Output phi::DenseTensor. It's meaningless if the output dtype equals the input dtype, but it's fine if you do so. )DOC"); diff --git a/paddle/fluid/operators/cast_op_mlu.cc b/paddle/fluid/operators/cast_op_mlu.cc index 7e85702eee4b1..cb0bc659fbb0f 100644 --- a/paddle/fluid/operators/cast_op_mlu.cc +++ b/paddle/fluid/operators/cast_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CastMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 9c430fc0ffe30..0e2775efd1328 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -32,8 +32,6 @@ static std::map {framework::proto::VarType::FP64, ACL_DOUBLE}, }; -using Tensor = phi::DenseTensor; - template class CastNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h index 989a27f552118..36fe957102bfb 100644 --- a/paddle/fluid/operators/center_loss_op.h +++ b/paddle/fluid/operators/center_loss_op.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -81,7 +80,7 @@ class CenterLossKernel : public framework::OpKernel { auto loss_data = out_loss->mutable_data(ctx.GetPlace()); - Tensor centers_diffacc; // used to accumulate all diff + phi::DenseTensor centers_diffacc; // used to accumulate all diff auto centers_diffacc_data = centers_diffacc.mutable_data(centers_dim, ctx.GetPlace()); int numel = centers_diffacc.numel(); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 21658be577ebd..f54e323eefb44 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; // using SelectedRows = phi::SelectedRows; template class NPUClipByNormKernel : public framework::OpKernel { public: @@ -48,7 +46,7 @@ class NPUClipByNormKernel : public framework::OpKernel { "Input(X) of ClipByNormOp should not be null. 
" "Please check if it is created correctly.")); - Tensor square_sum(input->type()); + phi::DenseTensor square_sum(input->type()); square_sum.mutable_data(framework::DDim({1}), place); const auto& x_dims = input->dims(); std::vector axis; @@ -62,12 +60,12 @@ class NPUClipByNormKernel : public framework::OpKernel { {{"axis", axis}, {"keep_dims", false}}); square_sum_runner.Run(stream); - Tensor x_norm(input->type()); + phi::DenseTensor x_norm(input->type()); x_norm.mutable_data(framework::DDim({1}), place); const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {}); x_norm_runner.Run(stream); - Tensor x_norm_t; + phi::DenseTensor x_norm_t; framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t); auto x_norm_v = static_cast(*x_norm_t.data()); if (x_norm_v <= max_norm) { diff --git a/paddle/fluid/operators/clip_op_mlu.cc b/paddle/fluid/operators/clip_op_mlu.cc index daced778a95dc..f84a493d6d399 100644 --- a/paddle/fluid/operators/clip_op_mlu.cc +++ b/paddle/fluid/operators/clip_op_mlu.cc @@ -29,7 +29,7 @@ class ClipMLUKernel : public framework::OpKernel { auto max = static_cast(ctx.Attr("max")); if (ctx.HasInput("Min")) { - Tensor min_cpu; + phi::DenseTensor min_cpu; auto* min_tensor = ctx.Input("Min"); auto* min_data = min_tensor->data(); if (platform::is_mlu_place(min_tensor->place())) { @@ -41,7 +41,7 @@ class ClipMLUKernel : public framework::OpKernel { } if (ctx.HasInput("Max")) { - Tensor max_cpu; + phi::DenseTensor max_cpu; auto* max_tensor = ctx.Input("Max"); auto* max_data = max_tensor->data(); if (platform::is_mlu_place(max_tensor->place())) { @@ -80,7 +80,7 @@ class ClipGradMLUKernel : public framework::OpKernel { auto min_val = ctx.Attr("min"); if (min_tensor) { - Tensor min_data; + phi::DenseTensor min_data; framework::TensorCopy( *min_tensor, platform::CPUPlace(), @@ -91,7 +91,7 @@ class ClipGradMLUKernel : public framework::OpKernel { } auto max_val = ctx.Attr("max"); if (max_tensor) { - Tensor max_data; + phi::DenseTensor max_data; framework::TensorCopy( *max_tensor, platform::CPUPlace(), diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc index 19ae23add0e10..82056ab0acb4a 100644 --- a/paddle/fluid/operators/clip_op_npu.cc +++ b/paddle/fluid/operators/clip_op_npu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ClipNPUKernel : public framework::OpKernel { public: @@ -33,8 +31,8 @@ class ClipNPUKernel : public framework::OpKernel { auto max_tensor = ctx.HasInput("Max") ? 
ctx.Input("Max") : nullptr; - Tensor min_tensor_temp(x->type()); - Tensor max_tensor_temp(x->type()); + phi::DenseTensor min_tensor_temp(x->type()); + phi::DenseTensor max_tensor_temp(x->type()); if (min_tensor == nullptr) { auto min_value = static_cast(ctx.Attr("min")); min_tensor_temp.mutable_data({1}, ctx.GetPlace()); @@ -74,7 +72,7 @@ class ClipGradNPUKernel : public framework::OpKernel { auto min_val = ctx.Attr("min"); if (min_tensor) { - Tensor min_data; + phi::DenseTensor min_data; framework::TensorCopy( *min_tensor, platform::CPUPlace(), @@ -86,7 +84,7 @@ class ClipGradNPUKernel : public framework::OpKernel { auto max_val = ctx.Attr("max"); if (max_tensor) { - Tensor max_data; + phi::DenseTensor max_data; framework::TensorCopy( *max_tensor, platform::CPUPlace(), diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 6bdfe9e8b754f..75e6df4baf82b 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -61,7 +61,7 @@ struct FillConstantVisitor { * = nullptr) const { #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(dev_ctx_.GetPlace())) { - Tensor tensor_tmp(framework::TransToPhiDataType(dtype_)); + phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_)); tensor_tmp.mutable_data({1}, context_.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value_)); diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 8d3af26f0c254..6920d51eb2637 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -151,10 +151,9 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, aclrtStream stream, const phi::DenseTensor* in) { - using Tensor = phi::DenseTensor; - Tensor out(in->type()); + phi::DenseTensor out(in->type()); - Tensor mean(in->type()); + phi::DenseTensor mean(in->type()); mean.Resize({1}); mean.mutable_data(dev_ctx.GetPlace()); std::vector axes; diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 0881b702ec0d8..40a0cb196f3bb 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -126,7 +124,7 @@ struct CSoftmaxWithCrossEntropyFunctor { const int N = phi::funcs::SizeToAxis(axis, logits_dims); const int D = phi::funcs::SizeFromAxis(axis, logits_dims); - Tensor logits_2d, softmax_2d, loss_2d; + phi::DenseTensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); softmax_2d.ShareDataWith(*softmax).Resize({N, D}); loss_2d.ShareDataWith(*loss).Resize({N, 1}); @@ -135,7 +133,7 @@ struct CSoftmaxWithCrossEntropyFunctor { auto eigen_softmax = math::EigenMatrix::From(softmax_2d); // step 1, obtain logit_max - Tensor logits_max; + phi::DenseTensor logits_max; logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* logits_max_buff = logits_max.mutable_data(place); @@ -163,7 +161,7 @@ struct CSoftmaxWithCrossEntropyFunctor { .unaryExpr(math::ValueClip()); // step 3, obtain predict target - Tensor predicted_logits; + phi::DenseTensor predicted_logits; predicted_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); @@ -215,7 +213,7 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); // step 5, obtain sum_exp_logits - Tensor sum_exp_logits; + phi::DenseTensor sum_exp_logits; sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); @@ -278,7 +276,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { const int N = phi::funcs::SizeToAxis(axis, logits_dims); const int D = phi::funcs::SizeFromAxis(axis, logits_dims); - Tensor logits_2d, softmax_2d, loss_2d; + phi::DenseTensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); softmax_2d.ShareDataWith(*softmax).Resize({N, D}); loss_2d.ShareDataWith(*loss).Resize({N, 1}); @@ -287,7 +285,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { auto eigen_softmax = math::EigenMatrix::From(softmax_2d); // step 1, obtain logit_max - Tensor logits_max; + phi::DenseTensor logits_max; logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); auto eigen_logits_max = math::EigenMatrix::From(logits_max); @@ -309,7 +307,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { .unaryExpr(math::ValueClip()); // step 3, obtain predict target - Tensor predicted_logits; + phi::DenseTensor predicted_logits; predicted_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); @@ -355,7 +353,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); // step 5, obtain sum_exp_logits - Tensor sum_exp_logits; + phi::DenseTensor sum_exp_logits; sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); @@ -405,7 +403,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); - Tensor logit_grad_2d; + phi::DenseTensor logit_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); int blocks = NumBlocks(N * D); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index ae65930b86ac0..0c6e7b31c9d2e 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -26,7 +26,6 @@ limitations under the License. 
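The softmax-with-cross-entropy hunks above keep the usual 2-D view over logits, softmax, and loss; only the declarations change now that the file-local Tensor alias is gone. A minimal sketch, assuming logits, softmax, loss, and axis are as in the functor above:

    // Flatten to (N, D) around `axis`; ShareDataWith reuses the storage, so
    // only the shape metadata changes.
    const int N = phi::funcs::SizeToAxis(axis, logits->dims());
    const int D = phi::funcs::SizeFromAxis(axis, logits->dims());
    phi::DenseTensor logits_2d, softmax_2d, loss_2d;
    logits_2d.ShareDataWith(*logits).Resize({N, D});
    softmax_2d.ShareDataWith(*softmax).Resize({N, D});
    loss_2d.ShareDataWith(*loss).Resize({N, 1});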
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ConcatOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc index b73460f2057e4..ebfd2895e783b 100644 --- a/paddle/fluid/operators/concat_op_mlu.cc +++ b/paddle/fluid/operators/concat_op_mlu.cc @@ -119,7 +119,7 @@ class ConcatGradMLUKernel : public framework::OpKernel { out_grad->dims().size())); // get output tensor that the name is not kEmptyVarName std::vector outputs_vec; - std::vector tmp_outputs_vec; + std::vector tmp_outputs_vec; std::vector output_descs; std::vector descs_vec; for (size_t j = 0; j < outs.size(); ++j) { @@ -129,7 +129,7 @@ class ConcatGradMLUKernel : public framework::OpKernel { output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j])); outputs_vec.push_back(GetBasePtr(outs[j])); } else { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.mutable_data(ins[j]->dims(), ctx.GetPlace()); tmp_outputs_vec.push_back(tmp_tensor); output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j])); diff --git a/paddle/fluid/operators/controlflow/logical_op_mlu.cc b/paddle/fluid/operators/controlflow/logical_op_mlu.cc index 5e1630447b9de..7f63513af7bac 100644 --- a/paddle/fluid/operators/controlflow/logical_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LogicalMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 7c2c11bbfb40e..38ffa202efa92 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -15,8 +15,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LogicalNotNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 924ed1fcf7d35..62bcfb545e00f 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -29,8 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // Base convolution operator definations for other conv // like operators to reuse the implementation. 
inline int ConvOutputSize( diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index d0067d5c5930a..214af06bbd7c7 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template @@ -56,8 +55,8 @@ class MLUConvOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_tensor(output->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_tensor(output->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; if (channel_last) { input_tensor.ShareDataWith(*input); @@ -78,7 +77,7 @@ class MLUConvOpKernel : public framework::OpKernel { output_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -166,8 +165,8 @@ class MLUConvGradOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_grad_tensor(output_grad->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_grad_tensor(output_grad->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; if (channel_last) { @@ -193,7 +192,7 @@ class MLUConvGradOpKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); auto filter_grad_dims = filter_grad->dims(); - Tensor temp_filter_grad(filter_grad->type()); + phi::DenseTensor temp_filter_grad(filter_grad->type()); temp_filter_grad.mutable_data({filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], @@ -234,7 +233,7 @@ class MLUConvGradOpKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor(input_grad->type()); + phi::DenseTensor input_grad_tensor(input_grad->type()); if (channel_last) { input_grad_tensor.ShareDataWith(*input_grad); } else { @@ -248,7 +247,7 @@ class MLUConvGradOpKernel : public framework::OpKernel { input_grad_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -326,8 +325,8 @@ class MLUDepthwiseConvOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_tensor(output->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_tensor(output->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; if (channel_last) { groups = in_dims[3]; @@ -350,7 +349,7 @@ class MLUDepthwiseConvOpKernel : public framework::OpKernel { output_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -438,8 +437,8 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor 
input_tensor(input->type()); - Tensor output_grad_tensor(output_grad->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_grad_tensor(output_grad->type()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; const std::vector perm_hwcm_to_mchw = {3, 2, 0, 1}; @@ -469,7 +468,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); auto filter_grad_dims = filter_grad->dims(); - Tensor temp_filter_grad(filter_grad->type()); + phi::DenseTensor temp_filter_grad(filter_grad->type()); // Details about setting diff_w hwcn for better performance, see the CNNL // documentation. temp_filter_grad.mutable_data({filter_grad_dims[perm_mchw_to_hwcm[0]], @@ -512,7 +511,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor(input_grad->type()); + phi::DenseTensor input_grad_tensor(input_grad->type()); if (channel_last) { input_grad_tensor.ShareDataWith(*input_grad); } else { @@ -526,7 +525,7 @@ class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { input_grad_tensor.set_layout(DataLayout::kNHWC); // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index f4c7de95483b5..6b8f7118473a5 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; static void CastToFP16(const framework::ExecutionContext& ctx, const aclrtStream& stream, @@ -104,7 +103,7 @@ class DepthwiseConvNPUKernel : public framework::OpKernel { std::vector strides(4, 1); std::vector dilations(4, 1); - Tensor input_tensor, output_tensor; + phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); @@ -125,7 +124,7 @@ class DepthwiseConvNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); // Transform filter (n, 1, h, w) --> (1, n, h, w) - Tensor transformed_filter(filter->type()); + phi::DenseTensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], @@ -189,7 +188,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); // Transform filter (n, 1, h, w) --> (1, n, h, w) - Tensor transformed_filter(filter->type()); + phi::DenseTensor transformed_filter(filter->type()); transformed_filter.mutable_data({filter->dims()[1], filter->dims()[0], filter->dims()[2], @@ -204,7 +203,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { std::vector strides(4, 1); std::vector dilations(4, 1); - Tensor input_tensor, output_grad_tensor; + phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { @@ -247,7 +246,7 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor; + phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { 
input_grad_tensor.set_layout(DataLayout::kNHWC); @@ -305,7 +304,7 @@ class NPUConvOpKernel : public framework::OpKernel { std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); - Tensor input_tensor, output_tensor; + phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); if (channel_last) { @@ -378,7 +377,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); - Tensor input_tensor, output_grad_tensor; + phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { @@ -400,7 +399,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); - Tensor filter_grad_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor filter_grad_fp32(experimental::DataType::FLOAT32); filter_grad_fp32.Resize(filter_grad->dims()); if (framework::TransToProtoVarType(input->dtype()) == @@ -430,7 +429,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = phi::vectorize(input->dims()); - Tensor input_grad_tensor; + phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); @@ -617,8 +616,9 @@ class NPUConv3dGradKernel : public framework::OpKernel { filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); - Tensor filter_grad_tensor = ctx.AllocateTmpTensor( - filter_grad->dims(), dev_ctx); + phi::DenseTensor filter_grad_tensor = + ctx.AllocateTmpTensor(filter_grad->dims(), + dev_ctx); filter_grad_tensor.ShareDataWith(*filter_grad); filter_grad_tensor.set_layout(DataLayout::kNCDHW); @@ -638,8 +638,9 @@ class NPUConv3dGradKernel : public framework::OpKernel { input_grad->mutable_data(ctx.GetPlace()); std::vector input_shape_vec = phi::vectorize(input->dims()); - Tensor input_grad_tensor = ctx.AllocateTmpTensor( - input_grad->dims(), dev_ctx); + phi::DenseTensor input_grad_tensor = + ctx.AllocateTmpTensor(input_grad->dims(), + dev_ctx); input_grad_tensor.ShareDataWith(*input_grad); input_grad_tensor.set_layout(DataLayout::kNCDHW); diff --git a/paddle/fluid/operators/conv_transpose_op_mlu.cc b/paddle/fluid/operators/conv_transpose_op_mlu.cc index c2d68523d48cc..36d0be10575d1 100644 --- a/paddle/fluid/operators/conv_transpose_op_mlu.cc +++ b/paddle/fluid/operators/conv_transpose_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. 
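The conv3d gradient hunks above use the other allocation style touched by this change: a context-owned temporary from AllocateTmpTensor, which is likewise declared as phi::DenseTensor now that the alias is gone. A sketch with the template arguments reconstructed, assuming T, filter_grad, and dev_ctx are as in the kernel above:

    // Device temporary sized like filter_grad, owned by the execution context.
    phi::DenseTensor filter_grad_tensor =
        ctx.AllocateTmpTensor<T, platform::NPUDeviceContext>(
            filter_grad->dims(), dev_ctx);
    filter_grad_tensor.ShareDataWith(*filter_grad);
    filter_grad_tensor.set_layout(DataLayout::kNCDHW);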
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template @@ -61,8 +60,8 @@ class Conv2DTransposeMLUKernel : public framework::OpKernel { phi::UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_tensor(output->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_tensor(output->type()); input_tensor.set_layout(DataLayout::kNHWC); output_tensor.set_layout(DataLayout::kNHWC); const std::vector perm_to_nhwc = {0, 2, 3, 1}; @@ -84,7 +83,7 @@ class Conv2DTransposeMLUKernel : public framework::OpKernel { } // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -168,8 +167,8 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { phi::UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - Tensor input_tensor(input->type()); - Tensor output_grad_tensor(output_grad->type()); + phi::DenseTensor input_tensor(input->type()); + phi::DenseTensor output_grad_tensor(output_grad->type()); output_grad_tensor.set_layout(DataLayout::kNHWC); const std::vector perm_to_nhwc = {0, 2, 3, 1}; @@ -191,7 +190,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { } // transpose filter from MCHW to MHWC - Tensor trans_filter(filter->type()); + phi::DenseTensor trans_filter(filter->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, @@ -217,7 +216,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); - Tensor filter_grad_tensor(filter_grad->type()); + phi::DenseTensor filter_grad_tensor(filter_grad->type()); // filter_grad always MCHW // filter_grad_tensor always MHWC auto filter_grad_dims = filter_grad->dims(); @@ -253,7 +252,7 @@ class Conv2DTransposeGradMLUKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor(input_grad->type()); + phi::DenseTensor input_grad_tensor(input_grad->type()); input_tensor.set_layout(DataLayout::kNHWC); if (channel_last) { diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc index 2f674de03f7a2..3723a4841af30 100644 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ b/paddle/fluid/operators/conv_transpose_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -65,7 +64,7 @@ class Conv2DTransposeNPUKernel : public framework::OpKernel { std::vector strides(4, 1); std::vector dilations(4, 1); - Tensor input_tensor, output_tensor; + phi::DenseTensor input_tensor, output_tensor; input_tensor.ShareDataWith(*input); output_tensor.ShareDataWith(*output); @@ -148,7 +147,7 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { std::vector strides_vec(4, 1); std::vector dilations_vec(4, 1); - Tensor input_tensor, output_grad_tensor; + phi::DenseTensor input_tensor, output_grad_tensor; input_tensor.ShareDataWith(*input); output_grad_tensor.ShareDataWith(*output_grad); if (channel_last) { @@ -182,7 +181,7 @@ class Conv2DTransposeGradNPUKernel : public framework::OpKernel { } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); - Tensor input_grad_tensor; + phi::DenseTensor input_grad_tensor; input_grad_tensor.ShareDataWith(*input_grad); if (channel_last) { input_grad_tensor.set_layout(DataLayout::kNHWC); @@ -248,7 +247,7 @@ class Conv3DTransposeNPUKernel : public framework::OpKernel { std::vector strides(5, 1); std::vector dilations(5, 1); - Tensor input_tensor, output_tensor, filter_tensor; + phi::DenseTensor input_tensor, output_tensor, filter_tensor; input_tensor.Resize(input->dims()); input_tensor.ShareDataWith(*input); output_tensor.Resize(output->dims()); diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc index a36e9b73639ba..56f334b66571d 100644 --- a/paddle/fluid/operators/copy_cross_scope_op.cc +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -30,8 +30,6 @@ class OpBase; } // namespace imperative } // namespace paddle -using Tensor = phi::DenseTensor; - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc index 5587b595cd470..2b3450d031607 100644 --- a/paddle/fluid/operators/correlation_op.cc +++ b/paddle/fluid/operators/correlation_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline std::vector CorrelationOutputSize(int batch, int input_height, int input_width, diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index e1935f0dae2ad..5d4f11a876585 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -21,13 +21,11 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - // get Tensor + // get phi::DenseTensor auto* in_x = context.Input("X"); auto* in_y = context.Input("Y"); auto* out_z = context.Output("Out"); @@ -74,7 +72,7 @@ template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - // get Tensor + // get phi::DenseTensor auto* in_x = context.Input("X"); auto* in_y = context.Input("Y"); auto* in_z = context.Input("Out"); diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index 8980e5f73dee7..916ad89f1e72c 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CropNPUKernel : public framework::OpKernel { public: @@ -71,7 +69,7 @@ class CropNPUKernel : public framework::OpKernel { x->dims().size())); // shape memory maybe have gc. - Tensor tmp_shape(*shape); + phi::DenseTensor tmp_shape(*shape); tmp_shape.mutable_data(ctx.GetPlace()); const auto& runner = @@ -90,7 +88,7 @@ class CropNPUKernel : public framework::OpKernel { "(%d) of the Input(X).", shape_size.size(), x->dims().size())); - Tensor tmp_shape(x->dtype()); + phi::DenseTensor tmp_shape(x->dtype()); tmp_shape.Resize(phi::make_ddim(shape_size)); tmp_shape.mutable_data(ctx.GetPlace()); const auto& runner = diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 8ae6f448d24ba..c581d33091c02 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CrossEntropyOpKernel : public framework::OpKernel { public: @@ -36,8 +34,8 @@ class CrossEntropyOpKernel : public framework::OpKernel { int rank = x->dims().size(); auto label_dims = labels->dims(); - Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); - Tensor labels_2d, y_2d; + phi::DenseTensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + phi::DenseTensor labels_2d, y_2d; if (label_dims.size() < rank) { labels_2d.ShareDataWith(*labels); labels_2d.Resize({phi::product(label_dims), 1}); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 9279cf531d449..c3647d6e8c2d7 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CTCAlignKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index d436a4b5d531d..97e5eae62ab3b 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -26,8 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template bool is_continuous(const Type &weight_list) { bool continuous = true; @@ -41,7 +39,7 @@ bool is_continuous(const Type &weight_list) { return continuous; } -int size_sum(const std::vector &weight_list) { +int size_sum(const std::vector &weight_list) { int size = 0; for (size_t i = 0; i < weight_list.size(); ++i) { auto in_size = weight_list[i]->numel(); @@ -53,8 +51,8 @@ int size_sum(const std::vector &weight_list) { template void weight_to_tensor(const platform::Place &place, gpuStream_t stream, - const std::vector &weight_list, - Tensor *weight) { + const std::vector &weight_list, + phi::DenseTensor *weight) { auto weight_data = weight->data(); int weight_offset = 0; for (size_t i = 0; i < weight_list.size(); ++i) { @@ -72,11 +70,12 @@ void weight_to_tensor(const platform::Place &place, } template -void weight_to_tensor_list(const platform::Place &place, - gpuStream_t stream, - std::vector *weight_grad, - const std::vector &weight_input, - const Tensor *weight) { +void weight_to_tensor_list( + const platform::Place &place, + gpuStream_t stream, + std::vector *weight_grad, + const std::vector &weight_input, + const phi::DenseTensor *weight) { int weight_offset = 0; auto *weight_data = weight->data(); for (size_t i = 0; i < weight_input.size(); ++i) { @@ -204,15 +203,15 @@ template class CudnnLSTMGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const Tensor *x = ctx.Input("Input"); - const Tensor *init_h = ctx.Input("InitH"); - const Tensor *init_c = ctx.Input("InitC"); + const phi::DenseTensor *x = ctx.Input("Input"); + const phi::DenseTensor *init_h = ctx.Input("InitH"); + const phi::DenseTensor *init_c = ctx.Input("InitC"); - Tensor *out = ctx.Output("Out"); - Tensor *last_h = ctx.Output("LastH"); - Tensor *last_c = ctx.Output("LastC"); - Tensor *reserve = ctx.Output("Reserve"); - Tensor *state_out = ctx.Output("StateOut"); + phi::DenseTensor *out = ctx.Output("Out"); + phi::DenseTensor *last_h = ctx.Output("LastH"); + phi::DenseTensor *last_c = ctx.Output("LastC"); + phi::DenseTensor *reserve = ctx.Output("Reserve"); + phi::DenseTensor *state_out = ctx.Output("StateOut"); const T *x_data = x->data(); const T *init_h_data = init_h->data(); @@ -256,7 +255,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { size_t workspace_size; size_t reserve_size; - Tensor weight_whole; + phi::DenseTensor weight_whole; T *w_data = nullptr; int weight_numel; bool w_initialized = false; @@ -272,7 +271,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { if (!w_initialized) { auto weight_list = ctx.MultiInput("WeightList"); bool continuous = - is_continuous>(weight_list); + is_continuous>(weight_list); weight_numel = size_sum(weight_list); if (!continuous) { @@ -288,7 +287,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { for (size_t i = 0; i < weight_list.size(); ++i) { size_t len = weight_list[i]->numel(); auto dim = weight_list[i]->dims(); - const_cast(weight_list[i]) + const_cast(weight_list[i]) ->ShareDataWith( weight_whole.Slice(static_cast(offset), static_cast(offset + len))) @@ -481,12 +480,12 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto place = ctx.GetPlace(); int weight_numel = size_sum(weight_list); bool continuous = - is_continuous>(weight_list); + is_continuous>(weight_list); auto stream = reinterpret_cast(ctx.device_context()) .stream(); - Tensor weight_whole; + phi::DenseTensor 
weight_whole; T *weight_data = nullptr; if (!continuous) { @@ -497,7 +496,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { weight_data = const_cast(weight_list[0]->data()); } - Tensor weight_grad; + phi::DenseTensor weight_grad; phi::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); @@ -559,7 +558,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { SequenceLength, &workspace_size, &reserve_size, - const_cast(state_out)); + const_cast(state_out)); phi::DenseTensor workspace_data_; workspace_data_.mutable_data( diff --git a/paddle/fluid/operators/cumsum_op_mlu.cc b/paddle/fluid/operators/cumsum_op_mlu.cc index 83d9a10af1730..fb586b9585e03 100644 --- a/paddle/fluid/operators/cumsum_op_mlu.cc +++ b/paddle/fluid/operators/cumsum_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CumSumMLUKernel : public framework::OpKernel { public: @@ -34,7 +32,7 @@ class CumSumMLUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); phi::DenseTensor* input_ptr = const_cast(x); - Tensor flat_x(x->type()); + phi::DenseTensor flat_x(x->type()); if (flatten) { PADDLE_ENFORCE_EQ( axis, diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 672a59cf22f59..7126e7ca4cbaf 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static void CumsumImp(const phi::DenseTensor& input, phi::DenseTensor* output, const framework::NPUAttributeMap& attr_input, @@ -30,7 +28,7 @@ static void CumsumImp(const phi::DenseTensor& input, .stream(); if (framework::TransToProtoVarType(input.dtype()) == framework::proto::VarType::INT64) { - Tensor tmp_input; + phi::DenseTensor tmp_input; tmp_input.mutable_data(input.dims(), ctx.GetPlace()); auto dst_acl_dtype = ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type())); @@ -41,7 +39,7 @@ static void CumsumImp(const phi::DenseTensor& input, {{"dst_type", static_cast(dst_acl_dtype)}}); cast_runner_1.Run(stream); - Tensor tmp_output; + phi::DenseTensor tmp_output; tmp_output.mutable_data(output->dims(), ctx.GetPlace()); const auto& runner = NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); @@ -86,7 +84,7 @@ class CumSumNPUKernel : public framework::OpKernel { -1, axis)); - Tensor new_x(x->type()); + phi::DenseTensor new_x(x->type()); new_x.ShareDataWith(*x); new_x.Resize(phi::make_ddim({x->numel()})); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index 153b181b4fd6a..11af33df2f61b 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -21,8 +21,6 @@ limitations under the License. 
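In the cudnn_lstm hunks above the weight-list helpers spell out their element type as well: the container is a std::vector of const phi::DenseTensor pointers and the flat buffer is a phi::DenseTensor. A sketch of the size helper and its call site, with the template arguments (elided in the quoted hunks) reconstructed:

    // Total number of elements across all weight tensors.
    int size_sum(const std::vector<const phi::DenseTensor *> &weight_list) {
      int size = 0;
      for (size_t i = 0; i < weight_list.size(); ++i) {
        size += static_cast<int>(weight_list[i]->numel());
      }
      return size;
    }

    // Call site inside the kernel:
    //   auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
    //   int weight_numel = size_sum(weight_list);
    //   phi::DenseTensor weight_whole;
    //   weight_whole.mutable_data<T>({weight_numel}, ctx.GetPlace());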
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class CVMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 5cac5392f4abb..400e025f82030 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -22,7 +22,6 @@ namespace paddle { namespace operators { using phi::PADDLE_CUDA_NUM_THREADS; -using Tensor = phi::DenseTensor; template __global__ void CvmComputeKernel(const bool use_cvm, diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h index 9bd5a00b3733f..461575d25b75d 100644 --- a/paddle/fluid/operators/cvm_op.h +++ b/paddle/fluid/operators/cvm_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void CvmComputeKernel(const bool use_cvm, const int64_t item_width, diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index 36dc93445df59..6770a7e31c1a5 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template @@ -483,9 +482,9 @@ class DataNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Y@GRAD can not be found for computation")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } @@ -523,7 +522,7 @@ class DataNormGradKernel : public framework::OpKernel { (data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); // init output - Tensor *d_x = nullptr; + phi::DenseTensor *d_x = nullptr; if (ctx.HasOutput(framework::GradVarName("X"))) { d_x = ctx.Output(framework::GradVarName("X")); } @@ -587,12 +586,12 @@ class DataNormGradKernel : public framework::OpKernel { EigenVectorArrayMap d_bias_arr(d_bias_data, C); EigenVectorArrayMap d_scale_arr(d_scale_data, C); - Tensor dy_sum; + phi::DenseTensor dy_sum; dy_sum.Resize({C}); dy_sum.mutable_data(ctx.GetPlace()); EigenVectorArrayMap dy_sum_arr( dy_sum.mutable_data(ctx.GetPlace()), C); - Tensor dy_mul_x_sub_mean_mul_invstd_sum; + phi::DenseTensor dy_mul_x_sub_mean_mul_invstd_sum; dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 1b895b0c8daa5..aaccaecc72067 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -26,7 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; using phi::PADDLE_CUDA_NUM_THREADS; @@ -166,7 +165,7 @@ class DataNormGradKernel : public framework::OpKernel { const int C = x_dims[1]; // init output - Tensor *d_x = nullptr; + phi::DenseTensor *d_x = nullptr; if (ctx.HasOutput(framework::GradVarName("X"))) { d_x = ctx.Output(framework::GradVarName("X")); } diff --git a/paddle/fluid/operators/deformable_conv_op_mlu.cc b/paddle/fluid/operators/deformable_conv_op_mlu.cc index 08969ba98fcd2..f5814efb3f491 100644 --- a/paddle/fluid/operators/deformable_conv_op_mlu.cc +++ b/paddle/fluid/operators/deformable_conv_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DeformableConvMLUKernel : public framework::OpKernel { public: @@ -58,29 +56,29 @@ class DeformableConvMLUKernel : public framework::OpKernel { im2col_step); const std::vector perm_to_nhwc = {0, 2, 3, 1}; - Tensor trans_input(input->dtype()); + phi::DenseTensor trans_input(input->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/); - Tensor trans_offset(offset->dtype()); + phi::DenseTensor trans_offset(offset->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, offset, &trans_offset, true /*need_reshape_or_alloc*/); - Tensor trans_mask(mask->dtype()); + phi::DenseTensor trans_mask(mask->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/); - Tensor trans_filter(filter->dtype()); + phi::DenseTensor trans_filter(filter->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, true /*need_reshape_or_alloc*/); - Tensor tmp_output(output->dtype()); + phi::DenseTensor tmp_output(output->dtype()); auto output_dims = output->dims(); tmp_output.mutable_data( {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, @@ -167,54 +165,54 @@ class DeformableConvGradMLUKernel : public framework::OpKernel { groups, im2col_step); - Tensor tmp_input_grad; + phi::DenseTensor tmp_input_grad; auto input_dims = input->dims(); tmp_input_grad.mutable_data( {input_dims[0], input_dims[2], input_dims[3], input_dims[1]}, ctx.GetPlace()); - Tensor tmp_filter_grad; + phi::DenseTensor tmp_filter_grad; auto filter_dims = filter->dims(); tmp_filter_grad.mutable_data( {filter_dims[0], filter_dims[2], filter_dims[3], filter_dims[1]}, ctx.GetPlace()); - Tensor tmp_offset_grad; + phi::DenseTensor tmp_offset_grad; auto offset_dims = offset->dims(); tmp_offset_grad.mutable_data( {offset_dims[0], offset_dims[2], offset_dims[3], offset_dims[1]}, ctx.GetPlace()); - Tensor tmp_mask_grad; + phi::DenseTensor tmp_mask_grad; auto mask_dims = mask->dims(); tmp_mask_grad.mutable_data( {mask_dims[0], mask_dims[2], mask_dims[3], mask_dims[1]}, ctx.GetPlace()); const std::vector perm_to_nhwc = {0, 2, 3, 1}; - Tensor trans_output_grad(output_grad->dtype()); + phi::DenseTensor trans_output_grad(output_grad->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, &trans_output_grad, true /*need_reshape_or_alloc*/); - Tensor trans_input(input->dtype()); + phi::DenseTensor trans_input(input->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/); - Tensor trans_offset(offset->dtype()); + phi::DenseTensor trans_offset(offset->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, offset, &trans_offset, true /*need_reshape_or_alloc*/); - Tensor 
trans_mask(mask->dtype()); + phi::DenseTensor trans_mask(mask->dtype()); TransposeFromMLUTensor( ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/); - Tensor trans_filter(filter->dtype()); + phi::DenseTensor trans_filter(filter->dtype()); TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 80d248b818b4f..0e8c736431b11 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -39,7 +39,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::PADDLE_CUDA_NUM_THREADS; static inline int GET_BLOCKS(const int N) { diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 231d14e537b54..dabb69b5af8c1 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -33,8 +33,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template T bilinear_interp( const T* data, const T x, const T y, const int width, const int height) { @@ -518,7 +516,7 @@ class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { const int num_classes = no_trans ? 1 : channels_trans / 2; const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; - Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({num_rois}); int* roi_batch_id_data = roi_batch_id_list.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index a9ad6cdfb659d..adb60a8a8d064 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -30,8 +30,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) int const kThreadsPerBlock = sizeof(uint64_t) * 8; @@ -47,11 +45,11 @@ struct RangeInitFunctor { template static void SortDescending(const phi::GPUContext &ctx, - const Tensor &value, - Tensor *value_out, - Tensor *index_out) { + const phi::DenseTensor &value, + phi::DenseTensor *value_out, + phi::DenseTensor *index_out) { int num = static_cast(value.numel()); - Tensor index_in_t; + phi::DenseTensor index_in_t; int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); platform::ForRange for_range(ctx, num); for_range(RangeInitFunctor{0, 1, idx_in}); @@ -287,10 +285,10 @@ static __global__ void NMSKernel(const int n_boxes, template static void NMS(const phi::GPUContext &ctx, - const Tensor &proposals, - const Tensor &sorted_indices, + const phi::DenseTensor &proposals, + const phi::DenseTensor &sorted_indices, const T nms_threshold, - Tensor *keep_out, + phi::DenseTensor *keep_out, bool pixel_offset = true) { int boxes_num = proposals.dims()[0]; const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index 35368d0034221..583122b473d26 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class BipartiteMatchOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -234,7 +232,7 @@ class BipartiteMatchKernel : public framework::OpKernel { auto lod = dist_mat->lod().back(); for (size_t i = 0; i < lod.size() - 1; ++i) { if (lod[i + 1] > lod[i]) { - Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + phi::DenseTensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); BipartiteMatch(one_ins, indices + i * col, dist + i * col); if (type == "per_prediction") { ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 089f2f5569234..79f3b18b2dfce 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoDTenso = phi::DenseTensor; static constexpr int ImInfoSize = 3; diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index bb72ca194b54c..cb067f91662ed 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class BoxClipKernel : public framework::OpKernel { public: @@ -42,9 +40,10 @@ class BoxClipKernel : public framework::OpKernel { auto box_lod = input_box->lod().back(); int64_t n = static_cast(box_lod.size() - 1); for (int i = 0; i < n; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]); - Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]); + phi::DenseTensor output_slice = + output_box->Slice(box_lod[i], box_lod[i + 1]); ClipTiledBoxes(dev_ctx, im_info_slice, box_slice, &output_slice); } } diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc index 865f218170f45..089f58558ae73 100644 --- a/paddle/fluid/operators/detection/box_coder_op_npu.cc +++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct BoxCoderFunction { public: @@ -28,31 +26,31 @@ struct BoxCoderFunction { stream = ctx.template device_context() .stream(); } - Tensor Adds(const phi::DenseTensor& x, float scalar) { - Tensor y; + phi::DenseTensor Adds(const phi::DenseTensor& x, float scalar) { + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); runner.Run(stream); return y; } - Tensor Muls(const phi::DenseTensor& x, float scalar) { - Tensor y; + phi::DenseTensor Muls(const phi::DenseTensor& x, float scalar) { + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); runner.Run(stream); return y; } - Tensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { - Tensor z; + phi::DenseTensor Mul(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor z; z.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); runner.Run(stream); return z; } - Tensor SubWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor SubWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; z.mutable_data(shape, place); const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); runner.Run(stream); @@ -66,10 +64,10 @@ struct BoxCoderFunction { const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor DivWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor DivWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; DivWithBroadCastVoid(x, y, shape, &z); return z; } @@ -81,10 +79,10 @@ struct BoxCoderFunction { const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor MulWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor MulWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; MulWithBroadCastVoid(x, y, shape, &z); return z; } @@ -96,36 +94,36 @@ struct BoxCoderFunction { const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); runner.Run(stream); } - Tensor AddWithBroadCast(const phi::DenseTensor& x, - const phi::DenseTensor& y, - const framework::DDim& shape) { - Tensor z; + phi::DenseTensor AddWithBroadCast(const phi::DenseTensor& x, + const phi::DenseTensor& y, + const framework::DDim& shape) { + phi::DenseTensor z; AddWithBroadCastVoid(x, y, shape, &z); return z; } - Tensor Abs(const phi::DenseTensor& x) { - Tensor y; + phi::DenseTensor Abs(const phi::DenseTensor& x) { + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); runner.Run(stream); return y; } - Tensor Log(const phi::DenseTensor& x) { - Tensor t_x_m1 = Adds(x, -1); - Tensor y; + phi::DenseTensor Log(const phi::DenseTensor& x) { + phi::DenseTensor t_x_m1 = Adds(x, -1); + phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); runner.Run(stream); return y; } - Tensor Exp(const phi::DenseTensor& x) { - Tensor y; + phi::DenseTensor Exp(const phi::DenseTensor& x) { + 
phi::DenseTensor y; y.mutable_data(x.dims(), place); const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); runner.Run(stream); return y; } - Tensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor Dot(const phi::DenseTensor& x, const phi::DenseTensor& y) { auto dim_x = x.dims(); auto dim_y = y.dims(); PADDLE_ENFORCE_EQ( @@ -145,7 +143,7 @@ struct BoxCoderFunction { "got dim_x[1] = %d, dim_y[0] = %d.", dim_x[1], dim_y[0])); - Tensor z; + phi::DenseTensor z; z.mutable_data({dim_x[0], dim_y[1]}, place); const auto& runner = NpuOpRunner("MatMul", @@ -155,7 +153,7 @@ struct BoxCoderFunction { runner.Run(stream); return z; } - void ConcatVoid(const std::vector& inputs, + void ConcatVoid(const std::vector& inputs, const framework::DDim& shape_out, int axis, phi::DenseTensor* output) { @@ -172,18 +170,18 @@ struct BoxCoderFunction { runner.AddInputNames(names); runner.Run(stream); } - Tensor Concat(const std::vector& inputs, - const framework::DDim& shape_out, - int axis) { - Tensor output; + phi::DenseTensor Concat(const std::vector& inputs, + const framework::DDim& shape_out, + int axis) { + phi::DenseTensor output; ConcatVoid(inputs, shape_out, axis, &output); return output; } - Tensor Slice(const phi::DenseTensor& x, - const std::vector& offsets, - const std::vector& size, - const framework::DDim& shape) { - Tensor y; + phi::DenseTensor Slice(const phi::DenseTensor& x, + const std::vector& offsets, + const std::vector& size, + const framework::DDim& shape) { + phi::DenseTensor y; y.mutable_data(shape, place); const auto& runner = NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); @@ -218,8 +216,8 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, auto M = pb->dims()[0]; auto N = tb->dims()[0]; auto shape_0 = phi::make_ddim({4, 2}); - Tensor m_diff; - Tensor m_aver; + phi::DenseTensor m_diff; + phi::DenseTensor m_aver; std::vector vec_diff = {static_cast(-1), static_cast(0), static_cast(0), @@ -240,10 +238,10 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); BoxCoderFunction F(ctx); - Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); - Tensor tb_xy = F.Dot(*tb, m_aver); - Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1)); + phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + phi::DenseTensor tb_xy = F.Dot(*tb, m_aver); + phi::DenseTensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); pb_xy.Resize({1, M, 2}); pb_wh.Resize({1, M, 2}); @@ -253,15 +251,16 @@ void BoxCoderEnc(const framework::ExecutionContext& ctx, auto shape_half = phi::make_ddim({N, M, 2}); auto shape_full = phi::make_ddim({N, M, 4}); - Tensor out_xy_0 = F.DivWithBroadCast( + phi::DenseTensor out_xy_0 = F.DivWithBroadCast( F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); - Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); - Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); + phi::DenseTensor out_wh_0 = + F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); + phi::DenseTensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); if (pbv) { F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); } else { - Tensor t_var; + phi::DenseTensor t_var; std::vector vec_var(4); for (auto i = 0; i < 4; i++) { vec_var[i] = static_cast(variance[i]); @@ -281,8 +280,8 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, int axis, phi::DenseTensor* out) { auto shape_0 = phi::make_ddim({4, 2}); - Tensor m_diff; - Tensor m_aver; + phi::DenseTensor m_diff; + phi::DenseTensor m_aver; std::vector vec_diff = {static_cast(-1), static_cast(0), static_cast(0), @@ -303,8 +302,8 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); BoxCoderFunction F(ctx); - Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); - Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + phi::DenseTensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + phi::DenseTensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); auto pb_resize_shape = axis == 0 ? phi::make_ddim({1, pb->dims()[0], 2}) : phi::make_ddim({pb->dims()[0], 1, 2}); pb_xy.Resize(pb_resize_shape); @@ -313,18 +312,22 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, auto tbox_slice_shape = phi::make_ddim({tb->dims()[0], tb->dims()[1], 2}); std::vector tbox_slice_size = { static_cast(tb->dims()[0]), static_cast(tb->dims()[1]), 2}; - Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); - Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); + phi::DenseTensor tbox01 = + F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); + phi::DenseTensor tbox23 = + F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); - Tensor tb_xy; - Tensor tb_wh; + phi::DenseTensor tb_xy; + phi::DenseTensor tb_wh; if (pbv) { auto pbvt_slice_shape = phi::make_ddim({pbv->dims()[0], 2}); auto pbvt_resize_shape = axis == 0 ? 
phi::make_ddim({1, pbv->dims()[0], 2}) : phi::make_ddim({pbv->dims()[0], 1, 2}); std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; - Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); - Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); + phi::DenseTensor pbv_t01 = + F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); + phi::DenseTensor pbv_t23 = + F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); pbv_t01.Resize(pbvt_resize_shape); pbv_t23.Resize(pbvt_resize_shape); @@ -345,7 +348,7 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, &tb_xy); F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); } else { - Tensor t_var01, t_var23; + phi::DenseTensor t_var01, t_var23; auto t_var_shape = phi::make_ddim({1, 1, 2}); std::vector vec_var01 = {static_cast(variance[0]), static_cast(variance[1])}; @@ -366,9 +369,9 @@ void BoxCoderDec(const framework::ExecutionContext& ctx, tbox_slice_shape, &tb_wh); } - Tensor obox01 = + phi::DenseTensor obox01 = F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); - Tensor obox23 = + phi::DenseTensor obox23 = F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), (norm ? 0 : -1)); F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index 37dc10df7292a..e07e4034f330f 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -16,7 +16,6 @@ limitations under the License.*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class CollectFpnProposalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index b517f2ec1fdd3..29cf8da067f84 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -33,8 +33,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 64; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -74,13 +72,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int real_post_num = min(post_nms_topN, total_roi_num); fpn_rois->mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); - Tensor concat_rois; - Tensor concat_scores; + phi::DenseTensor concat_rois; + phi::DenseTensor concat_scores; T* concat_rois_data = concat_rois.mutable_data( {total_roi_num, kBBoxSize}, dev_ctx.GetPlace()); T* concat_scores_data = concat_scores.mutable_data({total_roi_num, 1}, dev_ctx.GetPlace()); - Tensor roi_batch_id_list; + phi::DenseTensor roi_batch_id_list; roi_batch_id_list.Resize({total_roi_num}); int* roi_batch_id_data = roi_batch_id_list.mutable_data(platform::CPUPlace()); @@ -130,20 +128,20 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { } // copy batch id list to GPU - Tensor roi_batch_id_list_gpu; + phi::DenseTensor roi_batch_id_list_gpu; framework::TensorCopy( roi_batch_id_list, dev_ctx.GetPlace(), &roi_batch_id_list_gpu); - Tensor index_in_t; + phi::DenseTensor index_in_t; int* idx_in = index_in_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); platform::ForRange 
for_range_total(dev_ctx, total_roi_num); for_range_total(RangeInitFunctor{0, 1, idx_in}); - Tensor keys_out_t; + phi::DenseTensor keys_out_t; T* keys_out = keys_out_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); - Tensor index_out_t; + phi::DenseTensor index_out_t; int* idx_out = index_out_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); @@ -175,21 +173,21 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sizeof(T) * 8, dev_ctx.stream()); index_out_t.Resize({real_post_num}); - Tensor sorted_rois; + phi::DenseTensor sorted_rois; sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); - Tensor sorted_batch_id; + phi::DenseTensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); phi::funcs::GPUGather( dev_ctx, roi_batch_id_list_gpu, index_out_t, &sorted_batch_id); - Tensor batch_index_t; + phi::DenseTensor batch_index_t; int* batch_idx_in = batch_index_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); platform::ForRange for_range_post(dev_ctx, real_post_num); for_range_post(RangeInitFunctor{0, 1, batch_idx_in}); - Tensor out_id_t; + phi::DenseTensor out_id_t; int* out_id_data = out_id_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); // Determine temporary device storage requirements @@ -222,7 +220,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); - Tensor length_lod; + phi::DenseTensor length_lod; int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); phi::funcs::SetConstant set_zero; diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc index a6f9170712d96..d1a609ad45de6 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using fp16 = paddle::platform::float16; template @@ -89,7 +88,7 @@ struct DensityPriorBoxFunction { const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); runner.Run(stream); } - void Concat(const std::vector& inputs, + void Concat(const std::vector& inputs, int axis, phi::DenseTensor* output) { // output should be init first @@ -131,14 +130,14 @@ struct DensityPriorBoxFunction { platform::Place place; aclrtStream stream; const framework::ExecutionContext& ctx; - Tensor t0; - Tensor t1; - Tensor tn; + phi::DenseTensor t0; + phi::DenseTensor t1; + phi::DenseTensor tn; }; template <> void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { - Tensor x_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor x_fp32(experimental::DataType::FLOAT32); x_fp32.mutable_data(x->dims(), place); FillNpuTensorWithConstant(&tn, static_cast(n)); const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); @@ -149,7 +148,7 @@ void DensityPriorBoxFunction::Arange(int n, phi::DenseTensor* x) { template <> void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, phi::DenseTensor* tsr_dst) { - Tensor tsr_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor tsr_fp32(experimental::DataType::FLOAT32); tsr_fp32.mutable_data(tsr_dst->dims(), place); framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); ctx.template device_context().Wait(); @@ -185,9 +184,9 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); DensityPriorBoxFunction F(ctx); - Tensor h(_type); + phi::DenseTensor h(_type); h.mutable_data({layer_h}, place); - Tensor w(_type); + phi::DenseTensor w(_type); w.mutable_data({layer_w}, place); F.Arange(layer_h, &h); F.Arange(layer_w, &w); @@ -203,11 +202,11 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { for (size_t i = 0; i < densities.size(); ++i) { num_priors_per_ratio += densities[i] * densities[i]; } - Tensor di(_type); - Tensor dj(_type); - Tensor shifts(_type); - Tensor box_w_ratio(_type); - Tensor box_h_ratio(_type); + phi::DenseTensor di(_type); + phi::DenseTensor dj(_type); + phi::DenseTensor shifts(_type); + phi::DenseTensor box_w_ratio(_type); + phi::DenseTensor box_h_ratio(_type); di.mutable_data({ratios_size * num_priors_per_ratio}, place); dj.mutable_data({ratios_size * num_priors_per_ratio}, place); shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); @@ -220,19 +219,21 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { // Range = start:start+ratios_size*density_sqr, density = densities[i] int density_sqr = densities[i] * densities[i]; // shifts[Range] = [step_average/density]*ratios_size*density_sqr - Tensor shifts_part = + phi::DenseTensor shifts_part = shifts.Slice(start, start + ratios_size * density_sqr); FillNpuTensorWithConstant(&shifts_part, static_cast(step_average / densities[i])); // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size - Tensor di_part = di.Slice(start, start + ratios_size * density_sqr); - Tensor dj_part = dj.Slice(start, start + ratios_size * density_sqr); + phi::DenseTensor di_part = + di.Slice(start, start + ratios_size * density_sqr); + phi::DenseTensor dj_part = + dj.Slice(start, start + ratios_size * density_sqr); if (densities[i] > 1) { di_part.Resize({ratios_size, densities[i], densities[i]}); dj_part.Resize({ratios_size, densities[i], 
densities[i]}); - Tensor range_n(_type); + phi::DenseTensor range_n(_type); range_n.mutable_data({densities[i]}, place); F.Arange(densities[i], &range_n); range_n.Resize({1, densities[i], 1}); @@ -254,9 +255,9 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { // Range_mini = start_box_ratio:start_box_ratio+density_sqr // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr - Tensor box_h_ratio_part = + phi::DenseTensor box_h_ratio_part = box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); - Tensor box_w_ratio_part = + phi::DenseTensor box_w_ratio_part = box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); FillNpuTensorWithConstant(&box_w_ratio_part, static_cast(fixed_sizes[i] * sqrt(ar))); @@ -274,8 +275,8 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts - Tensor c_x(_type); - Tensor c_y(_type); + phi::DenseTensor c_x(_type); + phi::DenseTensor c_y(_type); auto dim0 = phi::make_ddim({1, layer_w, ratios_size * num_priors_per_ratio, 1}); auto dim1 = @@ -301,17 +302,17 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); - Tensor zero_t(_type); - Tensor one_t(_type); + phi::DenseTensor zero_t(_type); + phi::DenseTensor one_t(_type); zero_t.mutable_data({1}, place); one_t.mutable_data({1}, place); FillNpuTensorWithConstant(&zero_t, static_cast(0)); FillNpuTensorWithConstant(&one_t, static_cast(1)); - Tensor outbox0(_type); - Tensor outbox1(_type); - Tensor outbox2(_type); - Tensor outbox3(_type); + phi::DenseTensor outbox0(_type); + phi::DenseTensor outbox1(_type); + phi::DenseTensor outbox2(_type); + phi::DenseTensor outbox3(_type); outbox0.mutable_data(dim0, place); outbox1.mutable_data(dim1, place); outbox2.mutable_data(dim0, place); @@ -349,17 +350,17 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); boxes->mutable_data(place); vars->mutable_data(place); - Tensor boxes_share(_type); - Tensor vars_share(_type); + phi::DenseTensor boxes_share(_type); + phi::DenseTensor vars_share(_type); boxes_share.ShareDataWith(*boxes); boxes_share.Resize(out_dim); vars_share.ShareDataWith(*vars); vars_share.Resize(out_dim); - Tensor box0(_type); - Tensor box1(_type); - Tensor box2(_type); - Tensor box3(_type); + phi::DenseTensor box0(_type); + phi::DenseTensor box1(_type); + phi::DenseTensor box2(_type); + phi::DenseTensor box3(_type); // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} out_dim[3] = 1; box0.mutable_data(out_dim, place); @@ -377,7 +378,7 @@ class DensityPriorBoxOpNPUKernel : public framework::OpKernel { std::vector multiples = { layer_h, layer_w, ratios_size * num_priors_per_ratio, 1}; - Tensor variances_t(_type); + phi::DenseTensor variances_t(_type); // variances.size() == 4 variances_t.mutable_data({4}, place); F.FloatVec2Tsr(variances, &variances_t); diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index f14768168a425..7ae5ba6ca8f9c 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -25,7 +25,6 @@ limitations 
under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kBoxDim = 4; template @@ -151,16 +150,17 @@ static inline void ExpandMaskTarget(const phi::CPUContext& ctx, } template -std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, - const phi::DenseTensor& im_info, - const phi::DenseTensor& gt_classes, - const phi::DenseTensor& is_crowd, - const phi::DenseTensor& gt_segms, - const phi::DenseTensor& rois, - const phi::DenseTensor& label_int32, - const int num_classes, - const int resolution, - const framework::LoD& segm_length) { +std::vector SampleMaskForOneImage( + const phi::CPUContext& ctx, + const phi::DenseTensor& im_info, + const phi::DenseTensor& gt_classes, + const phi::DenseTensor& is_crowd, + const phi::DenseTensor& gt_segms, + const phi::DenseTensor& rois, + const phi::DenseTensor& label_int32, + const int num_classes, + const int resolution, + const framework::LoD& segm_length) { // Prepare the mask targets by associating one gt mask to each training roi // that has a fg (non-bg) class label. const int64_t gt_size = static_cast(gt_classes.dims()[0]); @@ -218,15 +218,15 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, int gt_num = mask_gt_inds.size(); int fg_num = fg_inds.size(); - Tensor boxes_from_polys; + phi::DenseTensor boxes_from_polys; boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); Poly2Boxes(gt_polys, boxes_from_polys.data()); std::vector roi_has_mask = std::vector(fg_inds.begin(), fg_inds.end()); - Tensor mask_class_labels; - Tensor masks; - Tensor rois_fg; + phi::DenseTensor mask_class_labels; + phi::DenseTensor masks; + phi::DenseTensor rois_fg; auto im_scale = im_info.data()[2]; if (fg_num > 0) { @@ -251,7 +251,7 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, rois_fg_data[k] = rois_fg_data[k] / im_scale; } - Tensor overlaps_bbfg_bbpolys; + phi::DenseTensor overlaps_bbfg_bbpolys; overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); @@ -306,7 +306,7 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); } - Tensor masks_expand; + phi::DenseTensor masks_expand; ExpandMaskTarget( ctx, masks, mask_class_labels, resolution, num_classes, &masks_expand); @@ -315,13 +315,13 @@ std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, rois_fg_data[k] = rois_fg_data[k] * im_scale; } - Tensor roi_has_mask_t; + phi::DenseTensor roi_has_mask_t; int roi_has_mask_size = roi_has_mask.size(); int* roi_has_mask_data = roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); - std::vector res; + std::vector res; res.emplace_back(rois_fg); res.emplace_back(roi_has_mask_t); res.emplace_back(masks_expand); @@ -405,23 +405,23 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { lod0.emplace_back(num_mask); continue; } - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor gt_classes_slice = + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); - Tensor is_crowd_slice = + phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor label_int32_slice = + phi::DenseTensor label_int32_slice = label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]); - Tensor rois_slice = 
rois->Slice(rois_lod[i], rois_lod[i + 1]); + phi::DenseTensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]); auto sub_lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0); auto lod_length = sub_lod_and_offset.first; size_t s = sub_lod_and_offset.second.first; size_t e = sub_lod_and_offset.second.second; - Tensor gt_segms_slice = gt_segms->Slice(s, e); + phi::DenseTensor gt_segms_slice = gt_segms->Slice(s, e); - std::vector tensor_output = + std::vector tensor_output = SampleMaskForOneImage(dev_ctx, im_info_slice, gt_classes_slice, @@ -433,9 +433,9 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { resolution, lod_length); - Tensor sampled_mask_rois = tensor_output[0]; - Tensor sampled_roi_has_mask_int32 = tensor_output[1]; - Tensor sampled_mask_int32 = tensor_output[2]; + phi::DenseTensor sampled_mask_rois = tensor_output[0]; + phi::DenseTensor sampled_roi_has_mask_int32 = tensor_output[1]; + phi::DenseTensor sampled_mask_int32 = tensor_output[2]; AppendMask(mask_rois, kBoxDim * num_mask, &sampled_mask_rois); AppendMask( diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 1071641b6bc60..b11030f1d086a 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -25,7 +25,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kBoxDim = 4; template @@ -174,7 +173,7 @@ void Concat(const phi::CPUContext& context, const phi::DenseTensor& in_tensor_b, phi::DenseTensor* out_tensor) { int axis = 0; - std::vector inputs; + std::vector inputs; inputs.emplace_back(in_tensor_a); inputs.emplace_back(in_tensor_b); math::ConcatFunctor concat_functor; @@ -300,7 +299,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, phi::DenseTensor* sampled_max_overlap) { int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); - Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; + phi::DenseTensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); int* gt_box_inds_data = @@ -312,7 +311,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data); std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data); - Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; + phi::DenseTensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); @@ -325,7 +324,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); - Tensor fg_max_overlap, bg_max_overlap; + phi::DenseTensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); @@ -334,7 +333,7 @@ void GatherBoxesLabels(const phi::CPUContext& context, } template -std::vector SampleRoisForOneImage( +std::vector SampleRoisForOneImage( const phi::CPUContext& context, const phi::DenseTensor& rpn_rois_in, const 
phi::DenseTensor& gt_classes, @@ -355,7 +354,7 @@ std::vector SampleRoisForOneImage( const phi::DenseTensor& max_overlap) { // 1.1 map to original image auto im_scale = im_info.data()[2]; - Tensor rpn_rois; + phi::DenseTensor rpn_rois; rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); const T* rpn_rois_in_dt = rpn_rois_in.data(); T* rpn_rois_dt = rpn_rois.data(); @@ -367,10 +366,10 @@ std::vector SampleRoisForOneImage( int proposals_num = 1; if (is_cascade_rcnn) { - Tensor keep; + phi::DenseTensor keep; FilterRoIs(context, rpn_rois, max_overlap, &keep); - Tensor roi_filter; - // Tensor box_filter; + phi::DenseTensor roi_filter; + // phi::DenseTensor box_filter; if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); @@ -389,16 +388,16 @@ std::vector SampleRoisForOneImage( // 1.2 compute overlaps proposals_num += gt_boxes.dims()[0]; - Tensor proposal_to_gt_overlaps; + phi::DenseTensor proposal_to_gt_overlaps; proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); - Tensor boxes; + phi::DenseTensor boxes; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); Concat(context, gt_boxes, rpn_rois, &boxes); BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); - Tensor proposal_with_max_overlap; + phi::DenseTensor proposal_with_max_overlap; proposal_with_max_overlap.mutable_data({proposals_num}, context.GetPlace()); @@ -423,7 +422,8 @@ std::vector SampleRoisForOneImage( std::vector mapped_gt_inds = fg_bg_gt[2]; // mapped_gt_labels // Gather boxes and labels - Tensor sampled_boxes, sampled_labels, sampled_gts, sampled_max_overlap; + phi::DenseTensor sampled_boxes, sampled_labels, sampled_gts, + sampled_max_overlap; int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); int boxes_num = fg_num + bg_num; @@ -446,7 +446,7 @@ std::vector SampleRoisForOneImage( &sampled_max_overlap); // Compute targets - Tensor bbox_targets_single; + phi::DenseTensor bbox_targets_single; bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); BoxToDelta(fg_num, sampled_boxes, @@ -456,14 +456,14 @@ std::vector SampleRoisForOneImage( &bbox_targets_single); // Scale rois - Tensor sampled_rois; + phi::DenseTensor sampled_rois; sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); sampled_rois_et = sampled_boxes_et * im_scale; // Expand box targets - Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; + phi::DenseTensor bbox_targets, bbox_inside_weights, bbox_outside_weights; framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums}); bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); @@ -500,7 +500,7 @@ std::vector SampleRoisForOneImage( bbox_outside_weights_data[dst_idx + 3] = 1; } } - std::vector res; + std::vector res; res.emplace_back(sampled_rois); res.emplace_back(sampled_labels); res.emplace_back(bbox_targets); @@ -610,16 +610,16 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { lod0.emplace_back(num_rois); continue; } - Tensor rpn_rois_slice = + phi::DenseTensor rpn_rois_slice = rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); - Tensor gt_classes_slice = + phi::DenseTensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); - Tensor is_crowd_slice = + 
phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor gt_boxes_slice = + phi::DenseTensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor max_overlap_slice; + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor max_overlap_slice; if (is_cascade_rcnn) { auto* max_overlap = context.Input("MaxOverlap"); max_overlap_slice = @@ -628,7 +628,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { max_overlap_slice.mutable_data({rpn_rois_slice.dims()[0]}, context.GetPlace()); } - std::vector tensor_output = + std::vector tensor_output = SampleRoisForOneImage(dev_ctx, rpn_rois_slice, gt_classes_slice, @@ -647,12 +647,12 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { is_cascade_rcnn, is_cls_agnostic, max_overlap_slice); - Tensor sampled_rois = tensor_output[0]; - Tensor sampled_labels_int32 = tensor_output[1]; - Tensor sampled_bbox_targets = tensor_output[2]; - Tensor sampled_bbox_inside_weights = tensor_output[3]; - Tensor sampled_bbox_outside_weights = tensor_output[4]; - Tensor sampled_max_overlap = tensor_output[5]; + phi::DenseTensor sampled_rois = tensor_output[0]; + phi::DenseTensor sampled_labels_int32 = tensor_output[1]; + phi::DenseTensor sampled_bbox_targets = tensor_output[2]; + phi::DenseTensor sampled_bbox_inside_weights = tensor_output[3]; + phi::DenseTensor sampled_bbox_outside_weights = tensor_output[4]; + phi::DenseTensor sampled_max_overlap = tensor_output[5]; AppendRois(rois, kBoxDim * num_rois, &sampled_rois); AppendRois(labels_int32, num_rois, &sampled_labels_int32); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 6491c8b8fcece..030b99cd1dbd7 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -27,8 +27,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class GenerateProposalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -115,7 +113,7 @@ class GenerateProposalsKernel : public framework::OpKernel { context.GetPlace()); rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); - Tensor bbox_deltas_swap, scores_swap; + phi::DenseTensor bbox_deltas_swap, scores_swap; bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, dev_ctx.GetPlace()); scores_swap.mutable_data({num, h_score, w_score, c_score}, @@ -136,14 +134,14 @@ class GenerateProposalsKernel : public framework::OpKernel { int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + phi::DenseTensor scores_slice = scores_swap.Slice(i, i + 1); bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); - std::pair tensor_pair = + std::pair tensor_pair = ProposalForOneImage(dev_ctx, im_info_slice, anchors, @@ -155,8 +153,8 @@ class GenerateProposalsKernel : public framework::OpKernel { nms_thresh, min_size, eta); - Tensor &proposals = tensor_pair.first; - Tensor &scores = tensor_pair.second; + phi::DenseTensor &proposals = tensor_pair.first; + phi::DenseTensor &scores = tensor_pair.second; AppendProposals(rpn_rois, 4 * num_proposals, proposals); AppendProposals(rpn_roi_probs, num_proposals, scores); @@ -179,13 +177,13 @@ class GenerateProposalsKernel : public framework::OpKernel { rpn_roi_probs->Resize({num_proposals, 1}); } - std::pair ProposalForOneImage( + std::pair ProposalForOneImage( const phi::CPUContext &ctx, - const Tensor &im_info_slice, - const Tensor &anchors, - const Tensor &variances, - const Tensor &bbox_deltas_slice, // [M, 4] - const Tensor &scores_slice, // [N, 1] + const phi::DenseTensor &im_info_slice, + const phi::DenseTensor &anchors, + const phi::DenseTensor &variances, + const phi::DenseTensor &bbox_deltas_slice, // [M, 4] + const phi::DenseTensor &scores_slice, // [N, 1] int pre_nms_top_n, int post_nms_top_n, float nms_thresh, @@ -194,7 +192,7 @@ class GenerateProposalsKernel : public framework::OpKernel { auto *scores_data = scores_slice.data(); // Sort index - Tensor index_t; + phi::DenseTensor index_t; index_t.Resize({scores_slice.numel()}); int *index = index_t.mutable_data(ctx.GetPlace()); for (int i = 0; i < scores_slice.numel(); ++i) { @@ -212,7 +210,7 @@ class GenerateProposalsKernel : public framework::OpKernel { index_t.Resize({pre_nms_top_n}); } - Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + phi::DenseTensor scores_sel, bbox_sel, anchor_sel, var_sel; scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,26 +221,26 @@ class GenerateProposalsKernel : public framework::OpKernel { phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); - Tensor proposals; + phi::DenseTensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); ClipTiledBoxes(ctx, 
im_info_slice, proposals, &proposals, false); - Tensor keep; + phi::DenseTensor keep; FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { phi::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); - Tensor scores_filter; + phi::DenseTensor scores_filter; scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &scores_filter, static_cast(0)); return std::make_pair(bbox_sel, scores_filter); } - Tensor scores_filter; + phi::DenseTensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); @@ -251,7 +249,7 @@ class GenerateProposalsKernel : public framework::OpKernel { return std::make_pair(bbox_sel, scores_filter); } - Tensor keep_nms = + phi::DenseTensor keep_nms = phi::funcs::NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 0890ff493332c..5d7a034c28a8f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -28,24 +28,22 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - namespace { template -static std::pair ProposalForOneImage( +static std::pair ProposalForOneImage( const phi::GPUContext &ctx, - const Tensor &im_info, - const Tensor &anchors, - const Tensor &variances, - const Tensor &bbox_deltas, // [M, 4] - const Tensor &scores, // [N, 1] + const phi::DenseTensor &im_info, + const phi::DenseTensor &anchors, + const phi::DenseTensor &variances, + const phi::DenseTensor &bbox_deltas, // [M, 4] + const phi::DenseTensor &scores, // [N, 1] int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, float eta) { // 1. pre nms - Tensor scores_sort, index_sort; + phi::DenseTensor scores_sort, index_sort; SortDescending(ctx, scores, &scores_sort, &index_sort); int num = scores.numel(); int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() @@ -54,7 +52,7 @@ static std::pair ProposalForOneImage( index_sort.Resize({pre_nms_num, 1}); // 2. box decode and clipping - Tensor proposals; + phi::DenseTensor proposals; proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { @@ -68,7 +66,7 @@ static std::pair ProposalForOneImage( } // 3. filter - Tensor keep_index, keep_num_t; + phi::DenseTensor keep_index, keep_num_t; keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); keep_num_t.mutable_data({1}, ctx.GetPlace()); min_size = std::max(min_size, 1.0f); @@ -90,7 +88,7 @@ static std::pair ProposalForOneImage( ctx.Wait(); keep_index.Resize({keep_num}); - Tensor scores_filter, proposals_filter; + phi::DenseTensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { phi::funcs::SetConstant set_zero; @@ -110,13 +108,13 @@ static std::pair ProposalForOneImage( } // 4. 
nms - Tensor keep_nms; + phi::DenseTensor keep_nms; NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { keep_nms.Resize({post_nms_top_n}); } - Tensor scores_nms, proposals_nms; + phi::DenseTensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); @@ -171,7 +169,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { int64_t h_bbox = bbox_dim[2]; int64_t w_bbox = bbox_dim[3]; - Tensor bbox_deltas_swap, scores_swap; + phi::DenseTensor bbox_deltas_swap, scores_swap; bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, dev_ctx.GetPlace()); scores_swap.mutable_data({num, h_score, w_score, c_score}, @@ -200,14 +198,14 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { std::vector tmp_num; for (int64_t i = 0; i < num; ++i) { - Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + phi::DenseTensor scores_slice = scores_swap.Slice(i, i + 1); bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); - std::pair box_score_pair = + std::pair box_score_pair = ProposalForOneImage(dev_ctx, im_info_slice, anchors, @@ -220,8 +218,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { min_size, eta); - Tensor &proposals = box_score_pair.first; - Tensor &scores = box_score_pair.second; + phi::DenseTensor &proposals = box_score_pair.first; + phi::DenseTensor &scores = box_score_pair.second; memory::Copy(place, rpn_rois_data + num_proposals * 4, diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 08c7a649c1e1f..0445c21b1de3b 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -29,8 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class GenerateProposalsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc index 22bba5c57ffd8..2909c333e16ac 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct IouFunction { public: @@ -182,21 +180,21 @@ class IouSimilarityMLUKernel : public framework::OpKernel { auto M = y->dims()[0]; out->mutable_data({N, M}, place); - Tensor xt(_type); - Tensor yt(_type); + phi::DenseTensor xt(_type); + phi::DenseTensor yt(_type); xt.mutable_data({4, N}, place); yt.mutable_data({4, M}, place); std::vector vec_trans = {1, 0}; F.Transpose(x, &xt, vec_trans); F.Transpose(y, &yt, vec_trans); - Tensor xmin1 = xt.Slice(0, 1); - Tensor ymin1 = xt.Slice(1, 2); - Tensor xmax1 = xt.Slice(2, 3); - Tensor ymax1 = xt.Slice(3, 4); - Tensor xmin2 = yt.Slice(0, 1); - Tensor ymin2 = yt.Slice(1, 2); - Tensor xmax2 = yt.Slice(2, 3); - Tensor ymax2 = yt.Slice(3, 4); + phi::DenseTensor xmin1 = xt.Slice(0, 1); + phi::DenseTensor ymin1 = xt.Slice(1, 2); + phi::DenseTensor xmax1 = xt.Slice(2, 3); + phi::DenseTensor ymax1 = xt.Slice(3, 4); + phi::DenseTensor xmin2 = yt.Slice(0, 1); + phi::DenseTensor ymin2 = yt.Slice(1, 2); + phi::DenseTensor xmax2 = yt.Slice(2, 3); + phi::DenseTensor ymax2 = yt.Slice(3, 4); xmin1.Resize({N, 1}); ymin1.Resize({N, 1}); xmax1.Resize({N, 1}); @@ -206,12 +204,12 @@ class IouSimilarityMLUKernel : public framework::OpKernel { xmax2.Resize({1, M}); ymax2.Resize({1, M}); - Tensor w1(_type); - Tensor h1(_type); - Tensor w2(_type); - Tensor h2(_type); - Tensor area1(_type); - Tensor area2(_type); + phi::DenseTensor w1(_type); + phi::DenseTensor h1(_type); + phi::DenseTensor w2(_type); + phi::DenseTensor h2(_type); + phi::DenseTensor area1(_type); + phi::DenseTensor area2(_type); w1.mutable_data({N, 1}, place); h1.mutable_data({N, 1}, place); w2.mutable_data({1, M}, place); @@ -231,10 +229,10 @@ class IouSimilarityMLUKernel : public framework::OpKernel { F.Mul(&w1, &h1, &area1); F.Mul(&w2, &h2, &area2); - Tensor inter_xmax(_type); - Tensor inter_ymax(_type); - Tensor inter_xmin(_type); - Tensor inter_ymin(_type); + phi::DenseTensor inter_xmax(_type); + phi::DenseTensor inter_ymax(_type); + phi::DenseTensor inter_xmin(_type); + phi::DenseTensor inter_ymin(_type); inter_xmax.mutable_data({N, M}, place); inter_ymax.mutable_data({N, M}, place); inter_xmin.mutable_data({N, M}, place); @@ -244,8 +242,8 @@ class IouSimilarityMLUKernel : public framework::OpKernel { F.Maximum(&xmin1, &xmin2, &inter_xmin); F.Maximum(&ymin1, &ymin2, &inter_ymin); - Tensor inter_w(_type); - Tensor inter_h(_type); + phi::DenseTensor inter_w(_type); + phi::DenseTensor inter_h(_type); inter_w.mutable_data({N, M}, place); inter_h.mutable_data({N, M}, place); F.Sub(&inter_xmax, &inter_xmin, &inter_w); @@ -255,14 +253,14 @@ class IouSimilarityMLUKernel : public framework::OpKernel { F.Adds(&inter_w, 1.0f, &inter_w); F.Adds(&inter_h, 1.0f, &inter_h); } - Tensor zeros(_type); + phi::DenseTensor zeros(_type); zeros.mutable_data({1}, place); FillMLUTensorWithHostValue(ctx, static_cast(0), &zeros); F.Maximum(&inter_w, &zeros, &inter_w); F.Maximum(&inter_h, &zeros, &inter_h); F.Mul(&inter_w, &inter_h, out); - Tensor union_area(_type); + phi::DenseTensor union_area(_type); union_area.mutable_data({N, M}, place); F.Add(&area1, &area2, &union_area); F.Sub(&union_area, out, &union_area); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc index 5708d1ae6460a..7bdd105c37ae0 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc @@ -18,8 +18,6 @@ 
limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct IouFunction { public: @@ -108,21 +106,21 @@ class IouSimilarityNPUKernel : public framework::OpKernel { auto M = y->dims()[0]; out->mutable_data({N, M}, place); - Tensor xt(_type); - Tensor yt(_type); + phi::DenseTensor xt(_type); + phi::DenseTensor yt(_type); xt.mutable_data({4, N}, place); yt.mutable_data({4, M}, place); std::vector vec_trans = {1, 0}; F.Transpose(x, &xt, vec_trans); F.Transpose(y, &yt, vec_trans); - Tensor xmin1 = xt.Slice(0, 1); - Tensor ymin1 = xt.Slice(1, 2); - Tensor xmax1 = xt.Slice(2, 3); - Tensor ymax1 = xt.Slice(3, 4); - Tensor xmin2 = yt.Slice(0, 1); - Tensor ymin2 = yt.Slice(1, 2); - Tensor xmax2 = yt.Slice(2, 3); - Tensor ymax2 = yt.Slice(3, 4); + phi::DenseTensor xmin1 = xt.Slice(0, 1); + phi::DenseTensor ymin1 = xt.Slice(1, 2); + phi::DenseTensor xmax1 = xt.Slice(2, 3); + phi::DenseTensor ymax1 = xt.Slice(3, 4); + phi::DenseTensor xmin2 = yt.Slice(0, 1); + phi::DenseTensor ymin2 = yt.Slice(1, 2); + phi::DenseTensor xmax2 = yt.Slice(2, 3); + phi::DenseTensor ymax2 = yt.Slice(3, 4); xmin1.Resize({N, 1}); ymin1.Resize({N, 1}); xmax1.Resize({N, 1}); @@ -132,12 +130,12 @@ class IouSimilarityNPUKernel : public framework::OpKernel { xmax2.Resize({1, M}); ymax2.Resize({1, M}); - Tensor w1(_type); - Tensor h1(_type); - Tensor w2(_type); - Tensor h2(_type); - Tensor area1(_type); - Tensor area2(_type); + phi::DenseTensor w1(_type); + phi::DenseTensor h1(_type); + phi::DenseTensor w2(_type); + phi::DenseTensor h2(_type); + phi::DenseTensor area1(_type); + phi::DenseTensor area2(_type); w1.mutable_data({N, 1}, place); h1.mutable_data({N, 1}, place); w2.mutable_data({1, M}, place); @@ -157,10 +155,10 @@ class IouSimilarityNPUKernel : public framework::OpKernel { F.Mul(&w1, &h1, &area1); F.Mul(&w2, &h2, &area2); - Tensor inter_xmax(_type); - Tensor inter_ymax(_type); - Tensor inter_xmin(_type); - Tensor inter_ymin(_type); + phi::DenseTensor inter_xmax(_type); + phi::DenseTensor inter_ymax(_type); + phi::DenseTensor inter_xmin(_type); + phi::DenseTensor inter_ymin(_type); inter_xmax.mutable_data({N, M}, place); inter_ymax.mutable_data({N, M}, place); inter_xmin.mutable_data({N, M}, place); @@ -170,8 +168,8 @@ class IouSimilarityNPUKernel : public framework::OpKernel { F.Maximum(&xmin1, &xmin2, &inter_xmin); F.Maximum(&ymin1, &ymin2, &inter_ymin); - Tensor inter_w(_type); - Tensor inter_h(_type); + phi::DenseTensor inter_w(_type); + phi::DenseTensor inter_h(_type); inter_w.mutable_data({N, M}, place); inter_h.mutable_data({N, M}, place); F.Sub(&inter_xmax, &inter_xmin, &inter_w); @@ -181,14 +179,14 @@ class IouSimilarityNPUKernel : public framework::OpKernel { F.Adds(&inter_w, 1.0f, &inter_w); F.Adds(&inter_h, 1.0f, &inter_h); } - Tensor zeros(_type); + phi::DenseTensor zeros(_type); zeros.mutable_data({1}, place); FillNpuTensorWithConstant(&zeros, static_cast(0)); F.Maximum(&inter_w, &zeros, &inter_w); F.Maximum(&inter_h, &zeros, &inter_h); F.Mul(&inter_w, &inter_h, out); - Tensor union_area(_type); + phi::DenseTensor union_area(_type); union_area.mutable_data({N, M}, place); F.Add(&area1, &area2, &union_area); F.Sub(&union_area, out, &union_area); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index c2b8833bbd96c..1c5135fc4e8a7 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ 
-19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class LocalityAwareNMSOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -252,7 +250,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { int num_det = 0; int64_t class_num = scores->dims()[0]; - Tensor bbox_slice, score_slice; + phi::DenseTensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { if (c == background_label) continue; @@ -325,7 +323,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); const T* sdata; - Tensor bbox; + phi::DenseTensor bbox; bbox.Resize({scores.dims()[0], box_size}); int count = 0; for (const auto& it : selected_indices) { @@ -370,7 +368,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { int64_t box_dim = boxes.dims()[2]; int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; - Tensor boxes_slice, scores_slice; + phi::DenseTensor boxes_slice, scores_slice; int n = batch_size; for (int i = 0; i < n; ++i) { scores_slice = scores.Slice(i, i + 1); @@ -407,7 +405,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + phi::DenseTensor out = outs->Slice(s, e); LocalityAwareNMSOutput(dev_ctx, scores_slice, boxes_slice, diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 21e52a39c37ab..1beeaf1ba3356 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MatrixNMSOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 5af93551d786f..79077b3086671 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline std::vector GetNmsLodFromRoisNum( const phi::DenseTensor* rois_num) { std::vector rois_lod; @@ -228,7 +226,7 @@ class MultiClassNMSKernel : public framework::OpKernel { int num_det = 0; int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; - Tensor bbox_slice, score_slice; + phi::DenseTensor bbox_slice, score_slice; for (int64_t c = 0; c < class_num; ++c) { if (c == background_label) continue; if (scores_size == 3) { @@ -319,7 +317,7 @@ class MultiClassNMSKernel : public framework::OpKernel { auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); const T* sdata; - Tensor bbox; + phi::DenseTensor bbox; bbox.Resize({scores.dims()[0], box_size}); int count = 0; for (const auto& it : selected_indices) { @@ -373,7 +371,7 @@ class MultiClassNMSKernel : public framework::OpKernel { int64_t box_dim = boxes->dims()[2]; int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; - Tensor boxes_slice, scores_slice; + phi::DenseTensor boxes_slice, scores_slice; int n = 0; if (has_roisnum) { n = score_size == 3 ? 
batch_size : rois_num->numel(); @@ -449,7 +447,7 @@ class MultiClassNMSKernel : public framework::OpKernel { int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + phi::DenseTensor out = outs->Slice(s, e); if (return_index) { int* output_idx = index->mutable_data({num_kept, 1}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index e386465c3bdf6..7135853f9ff8b 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PolygonBoxTransformCPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index bbeb9f7f2858a..de43f2d62b455 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::PADDLE_CUDA_NUM_THREADS; #define CUDA_BLOCK_SIZE 16 diff --git a/paddle/fluid/operators/detection/prior_box_op_npu.cc b/paddle/fluid/operators/detection/prior_box_op_npu.cc index 8a3a313be159c..42845ff20f129 100644 --- a/paddle/fluid/operators/detection/prior_box_op_npu.cc +++ b/paddle/fluid/operators/detection/prior_box_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PriorBoxNPUKernel : public framework::OpKernel { public: @@ -50,7 +48,7 @@ class PriorBoxNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); - Tensor out(input->type()); + phi::DenseTensor out(input->type()); auto out_dims = phi::vectorize(boxes->dims()); out_dims.insert(out_dims.begin(), 2); out.Resize(phi::make_ddim(out_dims)); @@ -75,8 +73,8 @@ class PriorBoxNPUKernel : public framework::OpKernel { runner.Run(stream); out.Resize(phi::make_ddim({out.numel()})); - Tensor out_boxes = out.Slice(0, boxes->numel()); - Tensor out_variances = out.Slice(boxes->numel(), out.numel()); + phi::DenseTensor out_boxes = out.Slice(0, boxes->numel()); + phi::DenseTensor out_variances = out.Slice(boxes->numel(), out.numel()); out_boxes.Resize(boxes->dims()); out_variances.Resize(variances->dims()); diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index a38765e28d786..d2654e086d08d 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -18,8 +18,6 @@ limitations under the License. 
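A standalone sketch (plain C++, not the operator code above) of the "batch_starts" bookkeeping the NMS kernels use: per-image detection counts are prefix-summed into offsets, and image i then owns output rows [batch_starts[i], batch_starts[i + 1]). The counts below are hypothetical.

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical per-image numbers of kept detections.
  std::vector<int64_t> num_kept_per_image = {3, 0, 5};

  std::vector<int64_t> batch_starts = {0};
  for (int64_t n : num_kept_per_image) {
    batch_starts.push_back(batch_starts.back() + n);
  }

  // An empty range (e == s) means the image produced no detections,
  // which is why the kernels guard the Slice with "if (e > s)".
  for (size_t i = 0; i + 1 < batch_starts.size(); ++i) {
    int64_t s = batch_starts[i];
    int64_t e = batch_starts[i + 1];
    if (e > s) {
      std::cout << "image " << i << " -> rows [" << s << ", " << e << ")\n";
    }
  }
  return 0;
}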
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -409,9 +407,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { } void RetinanetDetectionOutput(const framework::ExecutionContext& ctx, - const std::vector& scores, - const std::vector& bboxes, - const std::vector& anchors, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, const phi::DenseTensor& im_info, std::vector>* nmsed_out, int* num_nmsed_out) const { @@ -425,11 +423,11 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { std::map>> preds; for (size_t l = 0; l < scores.size(); ++l) { // Fetch per level score - Tensor scores_per_level = scores[l]; + phi::DenseTensor scores_per_level = scores[l]; // Fetch per level bbox - Tensor bboxes_per_level = bboxes[l]; + phi::DenseTensor bboxes_per_level = bboxes[l]; // Fetch per level anchor - Tensor anchors_per_level = anchors[l]; + phi::DenseTensor anchors_per_level = anchors[l]; int64_t scores_num = scores_per_level.numel(); int64_t bboxes_num = bboxes_per_level.numel(); @@ -492,9 +490,9 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { auto* im_info = ctx.Input("ImInfo"); auto* outs = ctx.Output("Out"); - std::vector boxes_list(boxes.size()); - std::vector scores_list(scores.size()); - std::vector anchors_list(anchors.size()); + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); for (size_t j = 0; j < boxes_list.size(); ++j) { boxes_list[j] = *boxes[j]; scores_list[j] = *scores[j]; @@ -512,8 +510,8 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { std::vector batch_starts = {0}; for (int i = 0; i < batch_size; ++i) { int num_nmsed_out = 0; - std::vector box_per_batch_list(boxes_list.size()); - std::vector score_per_batch_list(scores_list.size()); + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); for (size_t j = 0; j < boxes_list.size(); ++j) { const auto& score_dims = scores_list[j].dims(); score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); @@ -521,7 +519,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); box_per_batch_list[j].Resize({score_dims[1], box_dim}); } - Tensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); std::vector> nmsed_out; RetinanetDetectionOutput(ctx, @@ -544,7 +542,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { - Tensor out = outs->Slice(s, e); + phi::DenseTensor out = outs->Slice(s, e); MultiClassOutput(dev_ctx, all_nmsed_out[i], &out); } } @@ -563,7 +561,8 @@ class RetinanetDetectionOutputOpMaker void Make() override { AddInput("BBoxes", "(List) A list of tensors from multiple FPN levels. Each " - "element is a 3-D Tensor with shape [N, Mi, 4] represents the " + "element is a 3-D phi::DenseTensor with shape [N, Mi, 4] " + "represents the " "predicted locations of Mi bounding boxes, N is the batch size. " "Mi is the number of bounding boxes from i-th FPN level. 
Each " "bounding box has four coordinate values and the layout is " @@ -571,18 +570,20 @@ class RetinanetDetectionOutputOpMaker .AsDuplicable(); AddInput("Scores", "(List) A list of tensors from multiple FPN levels. Each " - "element is a 3-D Tensor with shape [N, Mi, C] represents the " + "element is a 3-D phi::DenseTensor with shape [N, Mi, C] " + "represents the " "predicted confidence from its FPN level. N is the batch size, " "C is the class number (excluding background), Mi is the number " "of bounding boxes from i-th FPN level. For each bounding box, " "there are total C scores.") .AsDuplicable(); - AddInput("Anchors", - "(List) A list of tensors from multiple FPN levels. Each" - "element is a 2-D Tensor with shape [Mi, 4] represents the " - "locations of Mi anchor boxes from i-th FPN level. Each " - "bounding box has four coordinate values and the layout is " - "[xmin, ymin, xmax, ymax].") + AddInput( + "Anchors", + "(List) A list of tensors from multiple FPN levels. Each" + "element is a 2-D phi::DenseTensor with shape [Mi, 4] represents the " + "locations of Mi anchor boxes from i-th FPN level. Each " + "bounding box has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax].") .AsDuplicable(); AddInput("ImInfo", "(phi::DenseTensor) A 2-D phi::DenseTensor with shape [N, 3] " diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index ff4c1159119e3..9ba51850ebaaa 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template bool GT_E(T a, T b) { return (a > b) || fabs(a - b) < 1e-4; @@ -600,7 +598,7 @@ class ROIPerspectiveTransformOpMaker public: void Make() override { AddInput("X", - "(Tensor), " + "(phi::DenseTensor), " "the input of ROIPerspectiveTransformOp. " "The format of input tensor is NCHW. Where N is batch size, " "C is the number of input channels, " @@ -617,28 +615,28 @@ class ROIPerspectiveTransformOpMaker "(x4, y4) is the bottom left coordinates."); AddOutput( "Out", - "(Tensor), " + "(phi::DenseTensor), " "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape " "(num_rois, channels, transformed_h, transformed_w)."); AddOutput("Mask", - "(Tensor), " + "(phi::DenseTensor), " "The output mask of ROIPerspectiveTransformOp is a 4-D tensor " "with shape " "(num_rois, 1, transformed_h, transformed_w)."); AddOutput("TransformMatrix", - "(Tensor), " + "(phi::DenseTensor), " "The output transform matrix of ROIPerspectiveTransformOp is a " "1-D tensor with shape " "(num_rois, 9)."); AddOutput("Out2InIdx", - "(Tensor), " + "(phi::DenseTensor), " "An intermediate tensor used to map indexes of input feature map " "and indexes of output feature map." "The shape of the tensor is [out_size, 4] and out_size is the " "number of elements in output feature map.") .AsIntermediate(); AddOutput("Out2InWeights", - "(Tensor), " + "(phi::DenseTensor), " "An intermediate tensor used to record the weights of bilinear " "interpolatein for each element in output. 
The shape of the " "tensor is [out_size, 4] and out_size is the number of elements " diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index f73ddf9a09e6e..ba7fe51383822 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -113,11 +112,12 @@ void AppendRpns(phi::DenseTensor* out, } template -std::vector FilterStraddleAnchor(const phi::CPUContext& context, - const phi::DenseTensor* anchor, - const float rpn_straddle_thresh, - T im_height, - T im_width) { +std::vector FilterStraddleAnchor( + const phi::CPUContext& context, + const phi::DenseTensor* anchor, + const float rpn_straddle_thresh, + T im_height, + T im_width) { std::vector inds_inside; int anchor_num = anchor->dims()[0]; auto* anchor_data = anchor->data(); @@ -138,25 +138,25 @@ std::vector FilterStraddleAnchor(const phi::CPUContext& context, } } int inside_num = inds_inside.size(); - Tensor inds_inside_t; + phi::DenseTensor inds_inside_t; int* inds_inside_data = inds_inside_t.mutable_data({inside_num}, context.GetPlace()); std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); - Tensor inside_anchor_t; + phi::DenseTensor inside_anchor_t; T* inside_anchor_data = inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); Gather( anchor->data(), 4, inds_inside_data, inside_num, inside_anchor_data); - std::vector res; + std::vector res; res.emplace_back(inds_inside_t); res.emplace_back(inside_anchor_t); return res; } template -Tensor FilterCrowdGt(const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* is_crowd) { +phi::DenseTensor FilterCrowdGt(const phi::CPUContext& context, + phi::DenseTensor* gt_boxes, + phi::DenseTensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -166,7 +166,7 @@ Tensor FilterCrowdGt(const phi::CPUContext& context, } } int ncrowd_num = not_crowd_inds.size(); - Tensor ncrowd_gt_boxes; + phi::DenseTensor ncrowd_gt_boxes; T* ncrowd_gt_boxes_data = ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); Gather(gt_boxes->data(), @@ -300,7 +300,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } template -std::vector SampleRpnFgBgGt( +std::vector SampleRpnFgBgGt( const phi::CPUContext& ctx, const phi::DenseTensor& anchor_by_gt_overlap, const int rpn_batch_size_per_im, @@ -322,7 +322,7 @@ std::vector SampleRpnFgBgGt( // Calculate the max IoU between anchors and gt boxes // Map from anchor to gt box that has highest overlap auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; anchor_to_gt_max.mutable_data({anchor_num}, place); int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); gt_to_anchor_max.mutable_data({gt_num}, place); @@ -365,7 +365,8 @@ std::vector SampleRpnFgBgGt( for (int i = 0; i < fg_fake_num; ++i) { gt_inds.emplace_back(argmax[fg_fake[i]]); } - Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; + phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, + bbox_inside_weight_t; int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); int* score_index_data = score_index_t.mutable_data({fg_num + 
bg_num}, place); @@ -381,7 +382,7 @@ std::vector SampleRpnFgBgGt( std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(), bbox_inside_weight_data); - std::vector loc_score_tgtlbl_gt; + std::vector loc_score_tgtlbl_gt; loc_score_tgtlbl_gt.emplace_back(loc_index_t); loc_score_tgtlbl_gt.emplace_back(score_index_t); loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); @@ -455,30 +456,30 @@ class RpnTargetAssignKernel : public framework::OpKernel { auto gt_boxes_lod = gt_boxes->lod().back(); auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor gt_boxes_slice = + phi::DenseTensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor is_crowd_slice = + phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); auto* im_info_data = im_info_slice.data(); auto im_height = im_info_data[0]; auto im_width = im_info_data[1]; auto im_scale = im_info_data[2]; // Filter straddle anchor - std::vector filter_output = FilterStraddleAnchor( + std::vector filter_output = FilterStraddleAnchor( dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); - Tensor inds_inside = filter_output[0]; - Tensor inside_anchor = filter_output[1]; + phi::DenseTensor inds_inside = filter_output[0]; + phi::DenseTensor inside_anchor = filter_output[1]; // Filter crowd gt - Tensor ncrowd_gt_boxes = + phi::DenseTensor ncrowd_gt_boxes = FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); auto ncrowd_gt_boxes_et = framework::EigenTensor::From(ncrowd_gt_boxes); ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - Tensor anchor_by_gt_overlap; + phi::DenseTensor anchor_by_gt_overlap; anchor_by_gt_overlap.mutable_data( {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); @@ -492,16 +493,16 @@ class RpnTargetAssignKernel : public framework::OpKernel { engine, use_random); - Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; - Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; + phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; + phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; int loc_num = sampled_loc_index.dims()[0]; int score_num = sampled_score_index.dims()[0]; // unmap to all anchor - Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; sampled_loc_index_unmap.mutable_data({loc_num}, place); sampled_score_index_unmap.mutable_data({score_num}, place); Gather(inds_inside.data(), @@ -516,7 +517,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { sampled_score_index_unmap.data()); // get target bbox deltas - Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; auto* sampled_anchor_data = sampled_anchor.mutable_data({loc_num, 4}, place); auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); @@ -859,10 +860,11 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel { }; template -std::vector 
FilterCrowdGtBoxLabel(const phi::CPUContext& context, - phi::DenseTensor* gt_boxes, - phi::DenseTensor* gt_labels, - phi::DenseTensor* is_crowd) { +std::vector FilterCrowdGtBoxLabel( + const phi::CPUContext& context, + phi::DenseTensor* gt_boxes, + phi::DenseTensor* gt_labels, + phi::DenseTensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -872,7 +874,7 @@ std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, } } int ncrowd_num = not_crowd_inds.size(); - Tensor ncrowd_gt_boxes, ncrowd_gt_labels; + phi::DenseTensor ncrowd_gt_boxes, ncrowd_gt_labels; T* ncrowd_gt_boxes_data = ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); int* ncrowd_gt_labels_data = @@ -887,19 +889,20 @@ std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, not_crowd_inds.data(), ncrowd_num, ncrowd_gt_labels_data); - std::vector res; + std::vector res; res.emplace_back(ncrowd_gt_boxes); res.emplace_back(ncrowd_gt_labels); return res; } template -std::vector GetAllFgBgGt(const phi::CPUContext& ctx, - const phi::DenseTensor& anchor_by_gt_overlap, - const phi::DenseTensor& ncrowd_gt_labels, - const float positive_overlap, - const float negative_overlap, - std::minstd_rand engine) { +std::vector GetAllFgBgGt( + const phi::CPUContext& ctx, + const phi::DenseTensor& anchor_by_gt_overlap, + const phi::DenseTensor& ncrowd_gt_labels, + const float positive_overlap, + const float negative_overlap, + std::minstd_rand engine) { auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); int anchor_num = anchor_by_gt_overlap.dims()[0]; int gt_num = anchor_by_gt_overlap.dims()[1]; @@ -913,7 +916,7 @@ std::vector GetAllFgBgGt(const phi::CPUContext& ctx, // Calculate the max IoU between anchors and gt boxes // Map from anchor to gt box that has highest overlap auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + phi::DenseTensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; anchor_to_gt_max.mutable_data({anchor_num}, place); int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); gt_to_anchor_max.mutable_data({gt_num}, place); @@ -961,8 +964,9 @@ std::vector GetAllFgBgGt(const phi::CPUContext& ctx, gt_inds.emplace_back(argmax[fg_fake[i]]); } - Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; - Tensor fg_num_t; + phi::DenseTensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, + bbox_inside_weight_t; + phi::DenseTensor fg_num_t; int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); int* score_index_data = score_index_t.mutable_data({fg_num + bg_num}, place); @@ -980,7 +984,7 @@ std::vector GetAllFgBgGt(const phi::CPUContext& ctx, bbox_inside_weight.end(), bbox_inside_weight_data); fg_num_data[0] = fg_fake.size() + 1; - std::vector loc_score_tgtlbl_gt; + std::vector loc_score_tgtlbl_gt; loc_score_tgtlbl_gt.emplace_back(loc_index_t); loc_score_tgtlbl_gt.emplace_back(score_index_t); loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); @@ -1065,35 +1069,35 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { auto gt_labels_lod = gt_labels->lod().back(); auto is_crowd_lod = is_crowd->lod().back(); for (int i = 0; i < batch_num; ++i) { - Tensor gt_boxes_slice = + phi::DenseTensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor gt_labels_slice = + phi::DenseTensor gt_labels_slice = gt_labels->Slice(gt_labels_lod[i], gt_labels_lod[i + 1]); - Tensor is_crowd_slice = + 
phi::DenseTensor is_crowd_slice = is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); + phi::DenseTensor im_info_slice = im_info->Slice(i, i + 1); auto* im_info_data = im_info_slice.data(); auto im_height = im_info_data[0]; auto im_width = im_info_data[1]; auto im_scale = im_info_data[2]; // Filter straddle anchor - std::vector filter_output = + std::vector filter_output = FilterStraddleAnchor(dev_ctx, anchor, -1, im_height, im_width); - Tensor inds_inside = filter_output[0]; - Tensor inside_anchor = filter_output[1]; + phi::DenseTensor inds_inside = filter_output[0]; + phi::DenseTensor inside_anchor = filter_output[1]; // Filter crowd gt - std::vector ncrowd_output = FilterCrowdGtBoxLabel( + std::vector ncrowd_output = FilterCrowdGtBoxLabel( dev_ctx, >_boxes_slice, >_labels_slice, &is_crowd_slice); - Tensor ncrowd_gt_boxes = ncrowd_output[0]; - Tensor ncrowd_gt_labels = ncrowd_output[1]; + phi::DenseTensor ncrowd_gt_boxes = ncrowd_output[0]; + phi::DenseTensor ncrowd_gt_labels = ncrowd_output[1]; auto ncrowd_gt_boxes_et = framework::EigenTensor::From(ncrowd_gt_boxes); ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - Tensor anchor_by_gt_overlap; + phi::DenseTensor anchor_by_gt_overlap; anchor_by_gt_overlap.mutable_data( {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); @@ -1105,17 +1109,17 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { negative_overlap, engine); - Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; - Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - Tensor sampled_fg_num = loc_score_tgtlbl_gt[5]; + phi::DenseTensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + phi::DenseTensor sampled_score_index = loc_score_tgtlbl_gt[1]; + phi::DenseTensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + phi::DenseTensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + phi::DenseTensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; + phi::DenseTensor sampled_fg_num = loc_score_tgtlbl_gt[5]; int loc_num = sampled_loc_index.dims()[0]; int score_num = sampled_score_index.dims()[0]; // unmap to all anchor - Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + phi::DenseTensor sampled_loc_index_unmap, sampled_score_index_unmap; sampled_loc_index_unmap.mutable_data({loc_num}, place); sampled_score_index_unmap.mutable_data({score_num}, place); Gather(inds_inside.data(), @@ -1130,7 +1134,7 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { sampled_score_index_unmap.data()); // get target bbox deltas - Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + phi::DenseTensor sampled_anchor, sampled_gt, sampled_tgt_bbox; auto* sampled_anchor_data = sampled_anchor.mutable_data({loc_num, 4}, place); auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 56d28c20dc8e7..6ff2e9c65d856 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -19,8 +19,6 @@ limitations under the License. 
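The RPN/Retinanet target-assign code above reduces an [anchor_num x gt_num] IoU matrix into per-anchor max/argmax and per-gt max before sampling. A minimal standalone sketch of that reduction, assuming a row-major overlaps buffer (the operator itself uses Eigen reductions):

#include <vector>

struct OverlapStats {
  std::vector<float> anchor_to_gt_max;
  std::vector<int> anchor_to_gt_argmax;
  std::vector<float> gt_to_anchor_max;
};

// overlaps is row-major with shape [anchor_num, gt_num].
OverlapStats ReduceOverlaps(const std::vector<float>& overlaps,
                            int anchor_num, int gt_num) {
  OverlapStats s;
  s.anchor_to_gt_max.assign(anchor_num, -1.0f);
  s.anchor_to_gt_argmax.assign(anchor_num, 0);
  s.gt_to_anchor_max.assign(gt_num, -1.0f);
  for (int a = 0; a < anchor_num; ++a) {
    for (int g = 0; g < gt_num; ++g) {
      float v = overlaps[a * gt_num + g];
      if (v > s.anchor_to_gt_max[a]) {   // best gt box for this anchor
        s.anchor_to_gt_max[a] = v;
        s.anchor_to_gt_argmax[a] = g;
      }
      if (v > s.gt_to_anchor_max[g]) {   // best anchor for this gt box
        s.gt_to_anchor_max[g] = v;
      }
    }
  }
  return s;
}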
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaxinumNumBlocks = 4096; @@ -123,10 +121,10 @@ template class GPUSigmoidFocalLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - Tensor *Out = context.Output("Out"); + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + phi::DenseTensor *Out = context.Output("Out"); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); auto x_dims = X->dims(); @@ -154,12 +152,13 @@ template class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - const Tensor *dOut = + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + const phi::DenseTensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); + phi::DenseTensor *dX = + context.Output(framework::GradVarName("X")); auto dx_data = dX->mutable_data(context.GetPlace()); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h index b7c77a5e28222..0632e5ab8fab0 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h @@ -22,16 +22,14 @@ limitations under the License. 
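A scalar reference for the sigmoid focal loss computed by the kernels above, written as a sketch of the standard formulation with fg_num normalization; the exact label and indexing conventions inside the CUDA/CPU kernels are simplified here.

#include <algorithm>
#include <cmath>

// x: raw logit for one class at one location; is_positive: whether the
// location is labeled with that class; fg_num: foreground-sample normalizer.
float SigmoidFocalLoss(float x, bool is_positive, int fg_num,
                       float gamma = 2.0f, float alpha = 0.25f) {
  float p = 1.0f / (1.0f + std::exp(-x));  // sigmoid probability
  float norm = static_cast<float>(fg_num > 0 ? fg_num : 1);
  if (is_positive) {
    // easy positives are down-weighted by (1 - p)^gamma
    return -alpha * std::pow(1.0f - p, gamma) *
           std::log(std::max(p, 1e-12f)) / norm;
  }
  // easy negatives are down-weighted by p^gamma
  return -(1.0f - alpha) * std::pow(p, gamma) *
         std::log(std::max(1.0f - p, 1e-12f)) / norm;
}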
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SigmoidFocalLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - Tensor *Out = context.Output("Out"); + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + phi::DenseTensor *Out = context.Output("Out"); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); auto out_data = Out->mutable_data(context.GetPlace()); @@ -79,12 +77,13 @@ template class SigmoidFocalLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *FgNum = context.Input("FgNum"); - const Tensor *dOut = + const phi::DenseTensor *X = context.Input("X"); + const phi::DenseTensor *Labels = context.Input("Label"); + const phi::DenseTensor *FgNum = context.Input("FgNum"); + const phi::DenseTensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); + phi::DenseTensor *dX = + context.Output(framework::GradVarName("X")); auto dx_data = dX->mutable_data(context.GetPlace()); T gamma = static_cast(context.Attr("gamma")); T alpha = static_cast(context.Attr("alpha")); diff --git a/paddle/fluid/operators/detection/yolo_box_op_mlu.cc b/paddle/fluid/operators/detection/yolo_box_op_mlu.cc index 739c05805d68a..aac3369381e95 100644 --- a/paddle/fluid/operators/detection/yolo_box_op_mlu.cc +++ b/paddle/fluid/operators/detection/yolo_box_op_mlu.cc @@ -77,7 +77,7 @@ class YoloBoxMLUKernel : public framework::OpKernel { MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType()); MLUOpTensorDesc img_size_desc( *img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType()); - Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32)); anchors_temp.Resize({size}); paddle::framework::TensorFromVector( anchors, ctx.device_context(), &anchors_temp); diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 5d3cccb3a6617..ada4d18eb00c1 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class DetectionMAPOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index d3f55edd8840f..8e362957e46e8 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DGCClipByNormKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index 7cf98738d073f..0d0686026da4b 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DropoutMLUKernel : public framework::OpKernel { public: @@ -106,8 +104,8 @@ class DropoutMLUKernel : public framework::OpKernel { } // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob). - Tensor scale_tensor(x->dtype()); - Tensor bias_tensor(x->dtype()); + phi::DenseTensor scale_tensor(x->dtype()); + phi::DenseTensor bias_tensor(x->dtype()); scale_tensor.mutable_data({1}, ctx.GetPlace()); bias_tensor.mutable_data({1}, ctx.GetPlace()); MLUCnnlTensorDesc scale_desc(scale_tensor); @@ -157,7 +155,7 @@ class DropoutGradMLUKernel : public framework::OpKernel { } // cast mask from uint8 to float32/float16 - Tensor cast_mask(grad_x->dtype()); + phi::DenseTensor cast_mask(grad_x->dtype()); cast_mask.Resize(mask->dims()); cast_mask.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index a63b6e5e479af..72453bedee399 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class DropoutNPUKernel : public framework::OpKernel { public: @@ -56,8 +54,8 @@ class DropoutNPUKernel : public framework::OpKernel { // only achieve the default `upscale_in_train` method if (!is_test) { - Tensor tmp_x(x->dtype()); - Tensor tmp_out(out->dtype()); + phi::DenseTensor tmp_x(x->dtype()); + phi::DenseTensor tmp_out(out->dtype()); tmp_x.ShareDataWith(*x); tmp_out.ShareDataWith(*out); if (x->dims().size() == 1) { @@ -80,7 +78,7 @@ class DropoutNPUKernel : public framework::OpKernel { seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; } - Tensor keep_prob_tensor(x->dtype()); + phi::DenseTensor keep_prob_tensor(x->dtype()); keep_prob_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&keep_prob_tensor, static_cast(keep_prob)); @@ -89,14 +87,14 @@ class DropoutNPUKernel : public framework::OpKernel { // mask used in `DropOutGenMask` NPU OP is different from // the output `Mask`. - Tensor npu_mask(experimental::DataType::UINT8); + phi::DenseTensor npu_mask(experimental::DataType::UINT8); uint32_t length = (x->numel() + 128 - 1) / 128 * 128; npu_mask.Resize(phi::make_ddim({length / 8})); npu_mask.mutable_data(ctx.GetPlace()); // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU // OP must be a scalar with shape[0]. At present, the shape - // of the `prob` Tensor of this OP is forced to be set to 0 + // of the `prob` phi::DenseTensor of this OP is forced to be set to 0 // in `npu_op_runner.cc`, which needs to be optimized later. 
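The dropout kernels above follow the `upscale_in_train` convention, while the MLU code's `downgrade_in_infer` branch scales by (1 - dropout_prob) at inference. A scalar sketch of the two conventions, with the RNG, mask packing and NPU/MLU specifics omitted:

// upscale_in_train: scale kept activations by 1/keep_prob while training,
// so inference is the identity.
float DropoutUpscaleInTrain(float x, bool keep, float keep_prob, bool is_test) {
  if (is_test) return x;
  return keep ? x / keep_prob : 0.0f;
}

// downgrade_in_infer: keep activations unscaled while training and
// multiply by keep_prob (= 1 - dropout_prob) at inference.
float DropoutDowngradeInInfer(float x, bool keep, float keep_prob, bool is_test) {
  if (is_test) return x * keep_prob;
  return keep ? x : 0.0f;
}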
NpuOpRunner runner_gen_mask; runner_gen_mask.SetType("DropOutGenMask") @@ -116,7 +114,7 @@ class DropoutNPUKernel : public framework::OpKernel { runner_dropout.Run(stream); // cast `out` from float/float16 to bool - Tensor cast_mask(experimental::DataType::BOOL); + phi::DenseTensor cast_mask(experimental::DataType::BOOL); cast_mask.Resize(mask->dims()); cast_mask.mutable_data(ctx.GetPlace()); auto dst_dtype_bool = @@ -176,7 +174,7 @@ class DropoutGradNPUKernel : public framework::OpKernel { } // cast mask from uint8 to float32/float16 - Tensor cast_mask(dx->dtype()); + phi::DenseTensor cast_mask(dx->dtype()); cast_mask.Resize(mask->dims()); cast_mask.mutable_data(ctx.GetPlace()); auto dst_dtype = diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc index 456a11f95aaca..7c6cd94782a9c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ElementwiseAddMLUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 17a1736c0871b..7b6683255ea93 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ElementwiseAddNPUKernel : public framework::OpKernel { @@ -53,7 +52,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); runner.Run(dev_ctx.stream()); } else { - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); const auto& runner = @@ -96,7 +95,7 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } } if (!reduce_axes.empty()) { - Tensor tmp; + phi::DenseTensor tmp; tmp.ShareDataWith(*dx); tmp.Resize(phi::make_ddim(dst_dims_vec)); const auto& runner = @@ -128,7 +127,7 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } } if (!reduce_axes.empty()) { - Tensor tmp; + phi::DenseTensor tmp; tmp.ShareDataWith(*dy); tmp.Resize(phi::make_ddim(dst_dims_vec)); const auto& runner = diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 236b40c122204..8c7aa350b4372 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -24,7 +24,6 @@ namespace operators { class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext* ctx) const override { auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc index 27f7281b9fb1e..d3e955cd2fe32 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_mlu.cc @@ -21,8 +21,6 @@ limitations under the 
License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseDivMLUKernel : public framework::OpKernel { public: @@ -66,7 +64,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel { CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); // compute dout/y == 1/y * dout - Tensor dout_div_y(dout->dtype()); + phi::DenseTensor dout_div_y(dout->dtype()); dout_div_y.Resize(dout->dims()); dout_div_y.mutable_data(ctx.GetPlace()); MLUBinary
(ctx, @@ -110,7 +108,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel { if (dy) { // compute dy = -out * (dout/y) = -out/y * dout - Tensor neg_out(out->type()); + phi::DenseTensor neg_out(out->type()); neg_out.mutable_data(out->dims(), ctx.GetPlace()); MLUCnnlTensorDesc out_desc(*out); @@ -121,7 +119,7 @@ class ElementwiseDivGradMLUKernel : public framework::OpKernel { out_desc.get(), GetBasePtr(&neg_out)); - Tensor dy_temp(y->dtype()); + phi::DenseTensor dy_temp(y->dtype()); dy_temp.Resize(dout->dims()); dy_temp.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc index 74a2a5b6ca6eb..6cc37517d4fbe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseDivNPUKernel : public framework::OpKernel { public: @@ -66,38 +64,38 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { if (dx) { dx->mutable_data(place); - Tensor tensor_one(y->type()); + phi::DenseTensor tensor_one(y->type()); tensor_one.mutable_data({1}, place); FillNpuTensorWithConstant(&tensor_one, static_cast(1.0)); // Use `Div` CANN OP to achieve `1/y` instead of `Power` CANN OP. // Because `Power` will cause precision overflow, that is, `float_status` // will be set to 1. - Tensor y_div(y->type()); + phi::DenseTensor y_div(y->type()); y_div.mutable_data(y->dims(), place); const auto& runner_one_div_y = NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {}); runner_one_div_y.Run(stream); - Tensor tensor_zeros(x->type()); + phi::DenseTensor tensor_zeros(x->type()); tensor_zeros.mutable_data(x->dims(), place); const auto& runner_tensor_zeros = NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); runner_tensor_zeros.Run(stream); - Tensor x_zero(experimental::DataType::BOOL); + phi::DenseTensor x_zero(experimental::DataType::BOOL); x_zero.mutable_data(x->dims(), place); const auto& runner_x_zero = NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); runner_x_zero.Run(stream); - Tensor x_nozero(experimental::DataType::BOOL); + phi::DenseTensor x_nozero(experimental::DataType::BOOL); x_nozero.mutable_data(x->dims(), place); const auto& runner_x_nonzero = NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); runner_x_nonzero.Run(stream); - Tensor x_nozero_f(x->type()); + phi::DenseTensor x_nozero_f(x->type()); x_nozero_f.mutable_data(x->dims(), place); const auto& runner_x_nonzero_f = NpuOpRunner("Cast", @@ -106,7 +104,7 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { {{"dst_type", static_cast(0)}}); runner_x_nonzero_f.Run(stream); - Tensor x_grad_w(x->type()); + phi::DenseTensor x_grad_w(x->type()); x_grad_w.mutable_data(x->dims(), place); const auto& runner_x_grad_w = NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {}); @@ -120,19 +118,19 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { if (dy) { dy->mutable_data(place); - Tensor neg_out(out->type()); + phi::DenseTensor neg_out(out->type()); neg_out.mutable_data(out->dims(), place); const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {}); runner_neg_out.Run(stream); - Tensor tmp_mul(out->type()); + phi::DenseTensor tmp_mul(out->type()); tmp_mul.mutable_data(out->dims(), place); const auto& runner_mul = NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, 
{}); runner_mul.Run(stream); if (dy->dims() != dout->dims()) { - Tensor reduced_tmp_mul(y->type()); + phi::DenseTensor reduced_tmp_mul(y->type()); reduced_tmp_mul.mutable_data(y->dims(), place); std::vector axes; diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc index 396f1b6f6223a..5f1b84112b2f9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseFloorDivNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc index fe91c28cd1f05..14bfbfb693b06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseMaxNPUKernel : public framework::OpKernel { public: @@ -51,7 +49,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); runner.Run(stream); } else { - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); const auto& runner = @@ -85,7 +83,7 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims = y->dims(); axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); @@ -99,9 +97,9 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { if (dx && dy) { dx->mutable_data(ctx.GetPlace()); dy->mutable_data(ctx.GetPlace()); - Tensor tmp_dx; + phi::DenseTensor tmp_dx; tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); - Tensor tmp_dy; + phi::DenseTensor tmp_dy; tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); const auto& runner = NpuOpRunner("MaximumGrad", @@ -153,12 +151,12 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { } } else if (dx) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx; + phi::DenseTensor tmp_dx; tmp_dx.mutable_data(dout_dims, ctx.GetPlace()); const auto& runner = NpuOpRunner("MaximumGrad", @@ -190,12 +188,12 @@ class ElementwiseMaxGradNPUKernel : public framework::OpKernel { } } else if (dy) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dy->mutable_data(ctx.GetPlace()); - Tensor tmp_dy; + phi::DenseTensor tmp_dy; tmp_dy.mutable_data(dout_dims, ctx.GetPlace()); const auto& runner = NpuOpRunner("MaximumGrad", diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc index 861ed2046c077..43b25b5127c8b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseMinMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index 8014f82ca5742..86c37e0c89020 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. 
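Per element, the Maximum/Minimum grad kernels above route dout to whichever input produced the output; the extra ReduceSumD steps only handle the case where an input was broadcast. A scalar sketch of the per-element rule (tie-breaking toward x is an assumption of this sketch):

struct PairGrad { float dx, dy; };

PairGrad MaxGrad(float x, float y, float dout) {
  return (x >= y) ? PairGrad{dout, 0.0f} : PairGrad{0.0f, dout};
}

PairGrad MinGrad(float x, float y, float dout) {
  return (x <= y) ? PairGrad{dout, 0.0f} : PairGrad{0.0f, dout};
}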
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseMinNPUKernel : public framework::OpKernel { public: @@ -48,7 +46,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel { } else { direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); } - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; if (direct_compute) { transformed_x.ShareDataWith(*x); transformed_y.ShareDataWith(*y); @@ -82,7 +80,7 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { if (dx && dy) { // dx dx->mutable_data(ctx.GetPlace()); - Tensor tmp_x; + phi::DenseTensor tmp_x; tmp_x.ShareDataWith(*dx); if (dx->dims() != dout->dims()) { std::vector dst_dims_vec_x; @@ -105,7 +103,7 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { } // dy dy->mutable_data(ctx.GetPlace()); - Tensor tmp_y; + phi::DenseTensor tmp_y; tmp_y.ShareDataWith(*dy); if (dy->dims() != dout->dims()) { std::vector dst_dims_vec_y; @@ -134,12 +132,12 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { runner.Run(stream); } else if (dx) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(y->dims(), ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); // dx dx->mutable_data(ctx.GetPlace()); - Tensor tmp_x; + phi::DenseTensor tmp_x; tmp_x.ShareDataWith(*dx); if (dx->dims() != dout->dims()) { std::vector dst_dims_vec_x; @@ -168,13 +166,13 @@ class ElementwiseMinGradNPUKernel : public framework::OpKernel { runner.Run(stream); } else if (dy) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(x->dims(), ctx.GetPlace()); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); // dy dy->mutable_data(ctx.GetPlace()); - Tensor tmp_y; + phi::DenseTensor tmp_y; tmp_y.ShareDataWith(*dy); if (dy->dims() != dout->dims()) { std::vector dst_dims_vec_y; diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 57f4b0c057686..9a33d5a26ad54 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -309,7 +309,7 @@ void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { // mask = Logic(x, y) only support min & max cnnlLogicOp_t logic = Functor == MAXIMUM_GRAD ? 
CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE; - Tensor mask(x->dtype()); + phi::DenseTensor mask(x->dtype()); mask.Resize(phi::make_ddim(out_dims_array)); mask.mutable_data(ctx.GetPlace()); @@ -327,7 +327,7 @@ void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { GetBasePtr(&mask)); // dx = Mul(dz, mask) - Tensor dx_temp(x->dtype()); + phi::DenseTensor dx_temp(x->dtype()); dx_temp.Resize(dout->dims()); dx_temp.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc dout_desc(*dout); @@ -344,7 +344,7 @@ void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) { data_type); // dy = Sub(dz, dx) - Tensor dy_temp(y->dtype()); + phi::DenseTensor dy_temp(y->dtype()); dy_temp.Resize(dout->dims()); dy_temp.mutable_data(ctx.GetPlace()); MLUCnnlOpTensorDesc sub_op_desc( diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc index bdeef48389b6c..f73fbba0fb496 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseModNPUKernel : public framework::OpKernel { public: @@ -43,7 +41,7 @@ class ElementwiseModNPUKernel : public framework::OpKernel { direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size()); } - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; if (direct_compute) { transformed_x.ShareDataWith(*x); transformed_y.ShareDataWith(*y); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c7b872af75a44..5aa1b7ed4f1dd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -25,7 +25,6 @@ namespace operators { class ElementwiseMulOp : public ElementwiseOp { public: - using Tensor = phi::DenseTensor; using ElementwiseOp::ElementwiseOp; framework::OpKernelType GetExpectedKernelType( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc index fe2848621c76f..c5f8a0ad711a6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using MLUDeviceContext = platform::MLUDeviceContext; template @@ -73,7 +72,7 @@ class ElementwiseMulGradMLUKernel : public framework::OpKernel { GetBasePtr(dx), ToCnnlDataType()); } else { - Tensor dx_temp(x->dtype()); + phi::DenseTensor dx_temp(x->dtype()); dx_temp.Resize(dout->dims()); dx_temp.mutable_data(ctx.GetPlace()); MLUCnnl::OpTensor(ctx, @@ -121,7 +120,7 @@ class ElementwiseMulGradMLUKernel : public framework::OpKernel { GetBasePtr(dy), ToCnnlDataType()); } else { - Tensor dy_temp(y->dtype()); + phi::DenseTensor dy_temp(y->dtype()); dy_temp.Resize(dout->dims()); dy_temp.mutable_data(ctx.GetPlace()); MLUCnnl::OpTensor(ctx, diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 4fc3be1b29cc7..d9bf2adeee72c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -78,7 +77,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); runner.Run(stream); } else { - Tensor trans_x, trans_y; + phi::DenseTensor trans_x, trans_y; NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); runner.Run(stream); @@ -101,7 +100,7 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); auto stream = ctx.template device_context().stream(); - Tensor trans_x, trans_y; + phi::DenseTensor trans_x, trans_y; NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); if (dx) { @@ -110,7 +109,7 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); runner_dx.Run(stream); } else { - Tensor dx_temp(x->type()); + phi::DenseTensor dx_temp(x->type()); dx_temp.Resize(trans_x.dims()); dx_temp.mutable_data(ctx.GetPlace()); const auto& runner_dx = @@ -126,7 +125,7 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); runner_dy.Run(stream); } else { - Tensor dy_temp(y->type()); + phi::DenseTensor dy_temp(y->type()); dy_temp.Resize(trans_y.dims()); dy_temp.mutable_data(ctx.GetPlace()); const auto& runner_dy = diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h index b7e85c45f4c7c..d8ee104c66b99 100644 --- a/paddle/fluid/operators/elementwise/elementwise_npu.h +++ b/paddle/fluid/operators/elementwise/elementwise_npu.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, @@ -32,12 +31,12 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, // 1. 
expand the axis with dim 1 auto src_dims = src->dims(); - Tensor tmp_src; + phi::DenseTensor tmp_src; tmp_src.ShareDataWith(*src); tmp_src.Resize(src_dims); for (int i = 0; i < src_dims.size(); ++i) { if (src_dims[i] == 1 && dst_dims[i + axis] > 1) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto tmp_tensor_dims = tmp_src.dims(); tmp_tensor_dims[i] = dst_dims[i + axis]; tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); @@ -56,7 +55,7 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, // 2.expand the ahead axis auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis)); if (prev > 1) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto tmp_tensor_dims = phi::slice_ddim(dst_dims, 0, axis + src_dims.size()); tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); const auto& runner = @@ -79,7 +78,7 @@ void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, src_dims_vec.push_back(1); tmp_src.Resize(phi::make_ddim(src_dims_vec)); - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; tmp_tensor.mutable_data(dst_dims, dev_ctx.GetPlace()); const auto& runner = NpuOpRunner("TileWithAxis", diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6bc9c345fcd4e..1ed8f4eb012a2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -36,8 +36,6 @@ class ElementwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; - void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ElementwiseOp"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ElementwiseOp"); @@ -282,7 +280,6 @@ For example: class ElementwiseOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { auto out_grad_name = framework::GradVarName("Out"); @@ -330,7 +327,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { auto x_grad_name = framework::GradVarName("X"); @@ -376,7 +372,6 @@ class ElementwiseOpDoubleGradWithoutDXDY : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { if (ctx->HasOutput("DDOut")) { @@ -427,7 +422,6 @@ class ElementwiseOpDoubleGradWithoutDXDY class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = phi::DenseTensor; void InferShape(framework::InferShapeContext *ctx) const override { if (ctx->HasOutput("D_DDX")) { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc index 6942377049b47..77d1160e4ce16 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. 
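The broadcast helpers above first align the lower-rank operand y to x starting at `axis`, then tile the size-1 or missing dimensions. A shapes-only sketch of that alignment, assuming x's rank is at least y's rank (no data movement is shown):

#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<int64_t> AlignedShape(const std::vector<int64_t>& x_dims,
                                  const std::vector<int64_t>& y_dims,
                                  int axis) {
  if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
  std::vector<int64_t> out(x_dims.size(), 1);
  for (size_t i = 0; i < y_dims.size(); ++i) {
    int64_t xd = x_dims[axis + i];
    int64_t yd = y_dims[i];
    if (yd != xd && yd != 1 && xd != 1) {
      throw std::invalid_argument("incompatible broadcast dims");
    }
    out[axis + i] = yd;  // 1 here means "tile along this axis later"
  }
  return out;  // e.g. x = [2, 3, 4], y = [3, 1], axis = 1  ->  [1, 3, 1]
}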
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwisePowMLUKernel : public framework::OpKernel { public: @@ -64,11 +62,11 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { auto dout_dims = dout->dims(); if (dx) { // dx = dout * y * pow(x, y - 1); - Tensor one_dx(y->type()); + phi::DenseTensor one_dx(y->type()); one_dx.mutable_data(phi::make_ddim(y_dims_array), place); FillMLUTensorWithHostValue(ctx, static_cast(1), &one_dx); - Tensor sub_dx(y->type()); + phi::DenseTensor sub_dx(y->type()); sub_dx.mutable_data(phi::make_ddim(y_dims_array), place); MLUCnnlOpTensorDesc op_tensor_desc( CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN); @@ -82,7 +80,7 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { GetBasePtr(&sub_dx), data_type); - Tensor tmp_dx(x->type()); + phi::DenseTensor tmp_dx(x->type()); tmp_dx.mutable_data(phi::make_ddim(out_dims_array), place); MLUCnnl::Pow(ctx, CNNL_COMPUTATION_HIGH_PRECISION, @@ -134,7 +132,7 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { } if (dy) { // dy = dout * log(x) * pow(x, y) - Tensor tmp_dy(y->type()); + phi::DenseTensor tmp_dy(y->type()); tmp_dy.mutable_data(phi::make_ddim(out_dims_array), place); MLUCnnl::Pow(ctx, CNNL_COMPUTATION_HIGH_PRECISION, @@ -145,7 +143,7 @@ class ElementwisePowGradMLUKernel : public framework::OpKernel { out_desc.get(), GetBasePtr(&tmp_dy)); - Tensor log_x(x->type()); + phi::DenseTensor log_x(x->type()); log_x.mutable_data(x->dims(), place); MLUCnnl::Log(ctx, CNNL_COMPUTATION_HIGH_PRECISION, diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc index 18853222ba6b7..b0b1b37c4f78d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwisePowNPUKernel : public framework::OpKernel { public: @@ -56,7 +54,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel { const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); runner.Run(stream); } else { - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); const auto& runner = @@ -84,7 +82,7 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel { auto y_dims = y->dims(); axis = (axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); - Tensor transformed_x, transformed_y; + phi::DenseTensor transformed_x, transformed_y; NpuElementWiseOpBroadcast( dev_ctx, x, y, axis, &transformed_x, &transformed_y); @@ -93,34 +91,34 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel { // Reshape info vector. 
std::vector reduce_axes; if (dx) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, place); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dx->mutable_data(place); - Tensor tmp_dx; + phi::DenseTensor tmp_dx; tmp_dx.mutable_data(dout_dims, place); // dx = dout * y * pow(x, y - 1); - Tensor PowGrad_dx_temp1(dout->type()); + phi::DenseTensor PowGrad_dx_temp1(dout->type()); PowGrad_dx_temp1.mutable_data(dout->dims(), place); const auto& runner_PowGrad_dx_temp1 = NpuOpRunner("Mul", {*dout, transformed_y}, {PowGrad_dx_temp1}, {}); runner_PowGrad_dx_temp1.Run(stream); - Tensor one_dx(transformed_y.type()); + phi::DenseTensor one_dx(transformed_y.type()); one_dx.mutable_data(transformed_y.dims(), place); const auto& runner_one_dx = NpuOpRunner("OnesLike", {transformed_y}, {one_dx}, {}); runner_one_dx.Run(stream); - Tensor sub_dx(transformed_y.type()); + phi::DenseTensor sub_dx(transformed_y.type()); sub_dx.mutable_data(transformed_y.dims(), place); const auto& runner_sub_dx = NpuOpRunner("Sub", {transformed_y, one_dx}, {sub_dx}, {}); runner_sub_dx.Run(stream); - Tensor PowGrad_dx_temp2(transformed_x.type()); + phi::DenseTensor PowGrad_dx_temp2(transformed_x.type()); PowGrad_dx_temp2.mutable_data(transformed_x.dims(), place); const auto& runner_PowGrad_dx_temp2 = NpuOpRunner("Pow", {transformed_x, sub_dx}, {PowGrad_dx_temp2}, {}); @@ -153,39 +151,39 @@ class ElementwisePowGradNPUKernel : public framework::OpKernel { } } if (dy) { - Tensor zero_tensor(dout->type()); + phi::DenseTensor zero_tensor(dout->type()); zero_tensor.mutable_data(dout_dims, place); FillNpuTensorWithConstant(&zero_tensor, static_cast(0)); dy->mutable_data(place); - Tensor tmp_dy; + phi::DenseTensor tmp_dy; tmp_dy.mutable_data(dout_dims, place); // dy = dout * log(x) * pow(x, y) - Tensor PowGrad_dy_temp1(transformed_x.type()); + phi::DenseTensor PowGrad_dy_temp1(transformed_x.type()); PowGrad_dy_temp1.mutable_data(transformed_x.dims(), place); const auto& runner_PowGrad_dy_temp1 = NpuOpRunner( "Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {}); runner_PowGrad_dy_temp1.Run(stream); - Tensor one_dy(transformed_x.type()); + phi::DenseTensor one_dy(transformed_x.type()); one_dy.mutable_data(transformed_x.dims(), place); const auto& runner_one_dy = NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {}); runner_one_dy.Run(stream); - Tensor sub_dy(transformed_x.type()); + phi::DenseTensor sub_dy(transformed_x.type()); sub_dy.mutable_data(transformed_x.dims(), place); const auto& runner_sub_dy = NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {}); runner_sub_dy.Run(stream); - Tensor log_dy(transformed_x.type()); + phi::DenseTensor log_dy(transformed_x.type()); log_dy.mutable_data(transformed_x.dims(), place); const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {}); runner_log_dy.Run(stream); - Tensor PowGrad_dy_temp2(transformed_x.type()); + phi::DenseTensor PowGrad_dy_temp2(transformed_x.type()); PowGrad_dy_temp2.mutable_data(transformed_x.dims(), place); const auto& runner_PowGrad_dy_temp2 = NpuOpRunner( "Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {}); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc index 0f56044d268e4..1233ae2d0ae0c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseSubMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 8df295a972559..9f70961c9f620 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ElementwiseSubNPUKernel : public framework::OpKernel { public: @@ -76,7 +74,7 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { axes.push_back(i); } phi::DenseTensor* tmp_dout = const_cast(dout); - Tensor reduced_dout(dx->type()); + phi::DenseTensor reduced_dout(dx->type()); if (axes.size() != 0) { std::vector reduced_dout_dims; for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { @@ -124,8 +122,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { axes.push_back(i); } phi::DenseTensor* tmp_dout = const_cast(dout); - Tensor reduced_dy(dy->type()); - Tensor reduced_dout(dy->type()); + phi::DenseTensor reduced_dy(dy->type()); + phi::DenseTensor reduced_dout(dy->type()); if (axes.size() != 0) { std::vector reduced_dout_dims; diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h index 58b6b619c231a..a3462a00bcfb1 100644 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index 1205fc0447f1e..2c62dc570ff21 100644 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_as_v2_op_mlu.cc b/paddle/fluid/operators/expand_as_v2_op_mlu.cc index 8184af44916bb..71b154ff02274 100644 --- a/paddle/fluid/operators/expand_as_v2_op_mlu.cc +++ b/paddle/fluid/operators/expand_as_v2_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ExpandAsV2MLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 35d16311a97b3..6d6739eed6702 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -85,7 +85,6 @@ inline std::vector get_expand_times( } } -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index d5748328b1d4d..95a4147c88dbd 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ExpandV2NPUKernel : public framework::OpKernel { public: @@ -121,8 +120,8 @@ class ExpandV2NPUKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); @@ -174,8 +173,8 @@ class ExpandV2NPUGradKernel : public framework::OpKernel { axes.push_back(i); } - Tensor tmp_dout(dout->dtype()); - Tensor reduced_dout(dx->dtype()); + phi::DenseTensor tmp_dout(dout->dtype()); + phi::DenseTensor reduced_dout(dx->dtype()); tmp_dout.ShareDataWith(*dout); if (axes.size() != 0) { std::vector reduced_dout_dims; diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc index 6a01992c83335..74bbc531c27e3 100644 --- a/paddle/fluid/operators/eye_op_npu.cc +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class EyeNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 433288e885d01..025c73db8c375 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; inline void FCOutputSize(const framework::DDim& in_dims, const framework::DDim& w_dims, diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 22df3e5a9d23a..a3ea1af82ee4d 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { public: @@ -80,7 +78,7 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { } else { out->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(data_type)); - Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); + phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, value); diff --git a/paddle/fluid/operators/fill_constant_op_mlu.cc b/paddle/fluid/operators/fill_constant_op_mlu.cc index 664d70609e939..8263534f4eeeb 100644 --- a/paddle/fluid/operators/fill_constant_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_op_mlu.cc @@ -60,7 +60,8 @@ class FillConstantMLUKernel : public framework::OpKernel { value_tensor->numel(), 1, platform::errors::InvalidArgument( - "When use Tensor as value to set Tensor value in fill_cosntant, " + "When use phi::DenseTensor as value to set phi::DenseTensor " + "value in fill_cosntant, " "value input(ValueTensor) size must be 1, but get %d", value_tensor->numel())); value_data = value_tensor->data(); diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 56068684e16ce..c07a69177b832 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -43,7 +43,6 @@ namespace cg = cooperative_groups; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index 04f1099168a5c..95e6611d9351f 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -29,7 +29,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 65d3f809fa11c..81af8e64f2767 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -28,8 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FlattenOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc index 177825020d0dc..6c8f986c5e5df 100644 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ b/paddle/fluid/operators/flatten_op_npu.cc @@ -55,8 +55,6 @@ class Flatten2GradNPUKernel : public framework::OpKernel { } }; -using Tensor = phi::DenseTensor; - template class FlattenContiguousRangeNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h index 0f8072520be2f..c5b903559a07b 100644 --- a/paddle/fluid/operators/fsp_op.h +++ b/paddle/fluid/operators/fsp_op.h @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FSPOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index c8ea19d463a1b..c0157c8cb04dd 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; // support gemm-nt and gemm-nn, which is used in fused_attention_op. template class AttnMatMul { diff --git a/paddle/fluid/operators/fused/attn_gemm_int8.h b/paddle/fluid/operators/fused/attn_gemm_int8.h index cdbd5b2e0b821..e26273b745260 100644 --- a/paddle/fluid/operators/fused/attn_gemm_int8.h +++ b/paddle/fluid/operators/fused/attn_gemm_int8.h @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::backends::gpu::GpuLaunchConfig; template diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 87ed8fb68fe2a..dee0c1837a452 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -27,7 +27,6 @@ namespace paddle { namespace operators { #if PADDLE_WITH_HIP || CUDNN_VERSION >= 7100 -using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; @@ -77,8 +76,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { const std::string padding_algorithm = ctx.Attr("padding_algorithm"); - Tensor transformed_input_channel(input->dtype()); - Tensor transformed_output(output->dtype()); + phi::DenseTensor transformed_input_channel(input->dtype()); + phi::DenseTensor transformed_output(output->dtype()); transformed_input_channel = *input; transformed_output = *output; T* output_data = transformed_output.data(); @@ -99,7 +98,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { int data_dim = strides.size(); // 2d or 3d bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input; + phi::DenseTensor transformed_input; std::vector padding_common(data_dim, 0); if (!is_sys_pad) { std::vector padding_diff(data_dim); @@ -144,7 +143,8 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } break; default: PADDLE_THROW(platform::errors::PermissionDenied( - "Operator Conv2DFusion expects Input to be a 4-D or 5-D Tensor. " + "Operator Conv2DFusion expects Input to be a 4-D or 5-D " + "phi::DenseTensor. 
" "But received the actual dimension = %d, shape = [%s].", rank, transformed_input_channel.dims())); diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 7c9af00955963..cbf098819212f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -31,7 +31,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace framework = paddle::framework; namespace platform = paddle::platform; namespace op = paddle::operators; -using Tensor = phi::DenseTensor; USE_OP_ITSELF(batch_norm); PD_DECLARE_KERNEL(batch_norm, GPU, ALL_LAYOUT); @@ -149,15 +148,15 @@ void ComputeInplaceRelu(phi::DenseTensor *cpu_x) { } void ComputeBatchNormForward(const phi::GPUContext &ctx, - const Tensor &cpu_x, - const Tensor &cpu_scale, - const Tensor &cpu_bias, - Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, - Tensor *cpu_y, - Tensor *saved_reserve_space) { + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_scale, + const phi::DenseTensor &cpu_bias, + phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *scale = scope.Var("Scale")->GetMutable(); @@ -215,16 +214,16 @@ void ComputeBatchNormForward(const phi::GPUContext &ctx, } void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, - const Tensor &cpu_x, - const Tensor &cpu_z, - const Tensor &cpu_scale, - const Tensor &cpu_bias, - Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, - Tensor *cpu_y, - Tensor *saved_reserve_space) { + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_z, + const phi::DenseTensor &cpu_scale, + const phi::DenseTensor &cpu_bias, + phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *z = scope.Var("Z")->GetMutable(); @@ -278,18 +277,18 @@ void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, } void ComputeFusedBNAddReluBackward(const phi::GPUContext &ctx, - const Tensor &cpu_dy, - const Tensor &cpu_x, - const Tensor &cpu_scale, - const Tensor &cpu_bias, - const Tensor &cpu_saved_mean, - const Tensor &cpu_saved_var, - const Tensor &cpu_y, - const Tensor &saved_reserve_space, - Tensor *cpu_dx, - Tensor *cpu_dz, - Tensor *cpu_dscale, - Tensor *cpu_dbias) { + const phi::DenseTensor &cpu_dy, + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_scale, + const phi::DenseTensor &cpu_bias, + const phi::DenseTensor &cpu_saved_mean, + const phi::DenseTensor &cpu_saved_var, + const phi::DenseTensor &cpu_y, + const phi::DenseTensor &saved_reserve_space, + phi::DenseTensor *cpu_dx, + phi::DenseTensor *cpu_dz, + phi::DenseTensor *cpu_dscale, + phi::DenseTensor *cpu_dbias) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *y = scope.Var("Y")->GetMutable(); @@ -383,7 +382,9 @@ class CudnnBNAddReluTester { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; + auto select = [&](phi::DenseTensor *in) { + return has_shortcut_ ? 
in : nullptr; + }; phi::DenseTensor cpu_mean_base_x; phi::DenseTensor cpu_var_base_x; @@ -506,10 +507,10 @@ class CudnnBNAddReluTester { InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); } - void InitMeanVar(Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var) { + void InitMeanVar(phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var) { InitConstantTensor({channels_}, static_cast(0.0f), cpu_mean); InitConstantTensor({channels_}, static_cast(1.0f), cpu_var); InitConstantTensor( @@ -519,17 +520,17 @@ class CudnnBNAddReluTester { } void BaselineForward(const phi::GPUContext &ctx, - Tensor *cpu_mean_x, - Tensor *cpu_var_x, - Tensor *cpu_saved_mean_x, - Tensor *cpu_saved_var_x, - Tensor *cpu_y, - Tensor *saved_reserve_space_x, - Tensor *cpu_mean_z = nullptr, - Tensor *cpu_var_z = nullptr, - Tensor *cpu_saved_mean_z = nullptr, - Tensor *cpu_saved_var_z = nullptr, - Tensor *saved_reserve_space_z = nullptr) { + phi::DenseTensor *cpu_mean_x, + phi::DenseTensor *cpu_var_x, + phi::DenseTensor *cpu_saved_mean_x, + phi::DenseTensor *cpu_saved_var_x, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space_x, + phi::DenseTensor *cpu_mean_z = nullptr, + phi::DenseTensor *cpu_var_z = nullptr, + phi::DenseTensor *cpu_saved_mean_z = nullptr, + phi::DenseTensor *cpu_saved_var_z = nullptr, + phi::DenseTensor *saved_reserve_space_z = nullptr) { InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); ComputeBatchNormForward(ctx, cpu_x_, @@ -566,12 +567,12 @@ class CudnnBNAddReluTester { } void BaselineForwardFusedBNAddRelu(const phi::GPUContext &ctx, - Tensor *cpu_mean, - Tensor *cpu_var, - Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, - Tensor *cpu_y, - Tensor *saved_reserve_space) { + phi::DenseTensor *cpu_mean, + phi::DenseTensor *cpu_var, + phi::DenseTensor *cpu_saved_mean, + phi::DenseTensor *cpu_saved_var, + phi::DenseTensor *cpu_y, + phi::DenseTensor *saved_reserve_space) { InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); ComputeFusedBNAddReluForward(ctx, cpu_x_, @@ -587,10 +588,10 @@ class CudnnBNAddReluTester { } void BaselineBackwardFusedBNAddRelu(const phi::GPUContext &ctx, - Tensor *cpu_dx, - Tensor *cpu_dz, - Tensor *cpu_dscale, - Tensor *cpu_dbias) { + phi::DenseTensor *cpu_dx, + phi::DenseTensor *cpu_dz, + phi::DenseTensor *cpu_dscale, + phi::DenseTensor *cpu_dbias) { ComputeFusedBNAddReluBackward(ctx, cpu_dy_, cpu_x_, @@ -607,19 +608,19 @@ class CudnnBNAddReluTester { } void ComputeFusedBNStatsFinalize(const phi::GPUContext &ctx, - const Tensor &cpu_x, - const Tensor &cpu_bn_scale, - const Tensor &cpu_bn_bias, - Tensor *sum, - Tensor *sum_of_square, - Tensor *bn_scale, - Tensor *bn_bias, - Tensor *mean, - Tensor *var, - Tensor *saved_mean, - Tensor *saved_var, - Tensor *equiv_scale, - Tensor *equiv_bias) { + const phi::DenseTensor &cpu_x, + const phi::DenseTensor &cpu_bn_scale, + const phi::DenseTensor &cpu_bn_bias, + phi::DenseTensor *sum, + phi::DenseTensor *sum_of_square, + phi::DenseTensor *bn_scale, + phi::DenseTensor *bn_bias, + phi::DenseTensor *mean, + phi::DenseTensor *var, + phi::DenseTensor *saved_mean, + phi::DenseTensor *saved_var, + phi::DenseTensor *equiv_scale, + phi::DenseTensor *equiv_bias) { phi::DenseTensor cpu_sum; phi::DenseTensor cpu_sum_of_square; ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); @@ -664,16 +665,16 @@ class CudnnBNAddReluTester { // Get forward results of CudnnBNStatsFinalize 
+ CudnnScaleBiasAddRelu void FusedForward(const phi::GPUContext &ctx, - Tensor *cpu_mean_x, - Tensor *cpu_var_x, - Tensor *cpu_saved_mean_x, - Tensor *cpu_saved_var_x, - Tensor *cpu_y, - Tensor *cpu_bitmask, - Tensor *cpu_mean_z = nullptr, - Tensor *cpu_var_z = nullptr, - Tensor *cpu_saved_mean_z = nullptr, - Tensor *cpu_saved_var_z = nullptr) { + phi::DenseTensor *cpu_mean_x, + phi::DenseTensor *cpu_var_x, + phi::DenseTensor *cpu_saved_mean_x, + phi::DenseTensor *cpu_saved_var_x, + phi::DenseTensor *cpu_y, + phi::DenseTensor *cpu_bitmask, + phi::DenseTensor *cpu_mean_z = nullptr, + phi::DenseTensor *cpu_var_z = nullptr, + phi::DenseTensor *cpu_saved_mean_z = nullptr, + phi::DenseTensor *cpu_saved_var_z = nullptr) { phi::DenseTensor x; phi::DenseTensor sum_x; phi::DenseTensor sum_of_square_x; @@ -802,10 +803,10 @@ class CudnnBNAddReluTester { // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu void FusedBackward(const phi::GPUContext &ctx, - Tensor *cpu_dx, - Tensor *cpu_dz, - Tensor *cpu_dscale, - Tensor *cpu_dbias) { + phi::DenseTensor *cpu_dx, + phi::DenseTensor *cpu_dz, + phi::DenseTensor *cpu_dscale, + phi::DenseTensor *cpu_dbias) { phi::DenseTensor dy; phi::DenseTensor x; phi::DenseTensor bn_scale; diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 0325a0e585ed3..762e86406917d 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; namespace dynload = platform::dynload; template using BatchNormParamType = @@ -70,16 +69,16 @@ class CudnnBNStatsFinalize { ~CudnnBNStatsFinalize() {} void Forward(const phi::GPUContext &ctx, - const Tensor &sum, - const Tensor &sum_of_squares, - const Tensor &scale, - const Tensor &bias, - Tensor *saved_mean, - Tensor *saved_invstd, - Tensor *running_mean, - Tensor *running_var, - Tensor *equiv_scale, - Tensor *equiv_bias, + const phi::DenseTensor &sum, + const phi::DenseTensor &sum_of_squares, + const phi::DenseTensor &scale, + const phi::DenseTensor &bias, + phi::DenseTensor *saved_mean, + phi::DenseTensor *saved_invstd, + phi::DenseTensor *running_mean, + phi::DenseTensor *running_var, + phi::DenseTensor *equiv_scale, + phi::DenseTensor *equiv_bias, double eps, float momentum, int64_t ele_count, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index bf0e06b825e4b..c82ccc959d204 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; namespace dynload = platform::dynload; template @@ -195,11 +194,11 @@ class CudnnNormConvolution { ~CudnnNormConvolution() {} void Forward(const phi::GPUContext &ctx, - const Tensor &input, - const Tensor &filter, - Tensor *output, - Tensor *sum, - Tensor *sum_of_squares) { + const phi::DenseTensor &input, + const phi::DenseTensor &filter, + phi::DenseTensor *output, + phi::DenseTensor *sum, + phi::DenseTensor *sum_of_squares) { auto cudnn_handle = ctx.cudnn_handle(); CudnnFusionOp *fwd_op = GetForwardOp(ctx); @@ -314,11 +313,11 @@ class CudnnNormConvolutionGrad { ~CudnnNormConvolutionGrad() {} void Backward(const phi::GPUContext &ctx, - const Tensor &input, - const Tensor &filter, - const Tensor &output_grad, - Tensor *input_grad, - Tensor *filter_grad, + const phi::DenseTensor &input, + const phi::DenseTensor &filter, + const phi::DenseTensor &output_grad, + phi::DenseTensor *input_grad, + phi::DenseTensor *filter_grad, bool use_addto = false) { T *input_ptr = const_cast(input.data()); T *filter_ptr = const_cast(filter.data()); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 3369a8ca4a9c5..4f7555aed8282 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -28,7 +28,6 @@ limitations under the License. */ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace op = paddle::operators; -using Tensor = phi::DenseTensor; USE_OP_ITSELF(conv2d); USE_OP_ITSELF(conv2d_grad); @@ -95,9 +94,9 @@ void CheckOutput(const phi::DenseTensor &cpu_res, // Use Paddle conv2d op results as baseline void ComputeConv2DForward(const phi::GPUContext &ctx, - const Tensor &cpu_input, - const Tensor &cpu_filter, - Tensor *cpu_output, + const phi::DenseTensor &cpu_input, + const phi::DenseTensor &cpu_filter, + phi::DenseTensor *cpu_output, int stride, int padding) { framework::Scope scope; @@ -131,9 +130,9 @@ void ComputeConv2DForward(const phi::GPUContext &ctx, // Use Paddle conv2d_grad op results as baseline void ComputeConv2DBackward(const phi::GPUContext &ctx, - const Tensor &cpu_input, - const Tensor &cpu_filter, - const Tensor &cpu_output_grad, + const phi::DenseTensor &cpu_input, + const phi::DenseTensor &cpu_filter, + const phi::DenseTensor &cpu_output_grad, phi::DenseTensor *cpu_input_grad, phi::DenseTensor *cpu_filter_grad, int stride, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index df79ed758dbc5..4ecc5795ff41a 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; namespace dynload = platform::dynload; @@ -117,14 +116,14 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} void Forward(const phi::GPUContext &ctx, - const Tensor &x, - const Tensor &x_scale, - const Tensor &x_bias, - const Tensor *z, - const Tensor *z_scale, - const Tensor *z_bias, - Tensor *out, - Tensor *bitmask) { + const phi::DenseTensor &x, + const phi::DenseTensor &x_scale, + const phi::DenseTensor &x_bias, + const phi::DenseTensor *z, + const phi::DenseTensor *z_scale, + const phi::DenseTensor *z_bias, + phi::DenseTensor *out, + phi::DenseTensor *bitmask) { ForwardInit(ctx); auto handle = ctx.cudnn_handle(); auto workspace_handle = ctx.cudnn_workspace_handle(); @@ -172,17 +171,17 @@ class CudnnScaleBiasAddRelu { } void Backward(const phi::GPUContext &ctx, - const Tensor &dy, - const Tensor &x, - const Tensor &scale, - const Tensor &bias, - const Tensor &saved_mean, - const Tensor &saved_invstd, - const Tensor *bitmask, - Tensor *dx, - Tensor *dz, - Tensor *dscale, - Tensor *dbias, + const phi::DenseTensor &dy, + const phi::DenseTensor &x, + const phi::DenseTensor &scale, + const phi::DenseTensor &bias, + const phi::DenseTensor &saved_mean, + const phi::DenseTensor &saved_invstd, + const phi::DenseTensor *bitmask, + phi::DenseTensor *dx, + phi::DenseTensor *dz, + phi::DenseTensor *dscale, + phi::DenseTensor *dbias, double eps) { BackwardInit(ctx); auto handle = ctx.cudnn_handle(); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 11939a454b9a0..47459884cc544 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -27,8 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AttnDropoutParam { public: AttnDropoutParam() { diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 03c97ec345fb8..b05a63510e385 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedAttentionOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index ef5087f0534e1..9454e589ec920 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -38,8 +38,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, @@ -528,7 +526,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { int input_size = dim_embed; bool add_residual = ctx.Attr("add_residual"); - Tensor d_residual; + phi::DenseTensor d_residual; T *d_residual_data = nullptr; if (add_residual) { d_residual.Resize(input_x_dims); @@ -728,8 +726,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { if (add_residual) { // gradient accumulation - std::vector ins = {&d_residual, d_x}; - std::vector outs = {d_x}; + std::vector ins = {&d_residual, d_x}; + std::vector outs = {d_x}; phi::funcs::ElementwiseKernel( ctx.cuda_device_context(), ins, &outs, phi::funcs::AddFunctor()); } diff --git a/paddle/fluid/operators/fused/fused_attention_op_xpu.cc b/paddle/fluid/operators/fused/fused_attention_op_xpu.cc index 6bf2e3d80335f..bbfa48f1dca78 100644 --- a/paddle/fluid/operators/fused/fused_attention_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_attention_op_xpu.cc @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -33,86 +31,88 @@ class FusedAttentionOpKernel : public framework::OpKernel { using XPUTypeT = typename XPUTypeTrait::Type; // inputs tensor - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); // shape [3, num_head, dim_head, dim_embed] - auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_weight = ctx.Input("QKVW"); // shape [3 , num_head, dim_head] - auto *qkv_bias = ctx.Input("QKVBias"); + auto *qkv_bias = ctx.Input("QKVBias"); // shape [batch_size, 1, 1, seq_len] - auto *src_mask = ctx.Input("SrcMask"); + auto *src_mask = ctx.Input("SrcMask"); // shape [dim_embed, dim_embed] - auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_weight = ctx.Input("OutLinearW"); // shape [dim_embed] - auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); - const Tensor *ln_scale = nullptr; - const Tensor *ln_bias = nullptr; + const phi::DenseTensor *ln_scale = nullptr; + const phi::DenseTensor *ln_bias = nullptr; float epsilon = 0.0f; if (pre_layer_norm) { - ln_scale = ctx.Input("LnScale"); - ln_bias = ctx.Input("LnBias"); + ln_scale = ctx.Input("LnScale"); + ln_bias = ctx.Input("LnBias"); epsilon = ctx.Attr("epsilon"); } else { - ln_scale = ctx.Input("Ln2Scale"); - ln_bias = ctx.Input("Ln2Bias"); + ln_scale = ctx.Input("Ln2Scale"); + ln_bias = ctx.Input("Ln2Bias"); epsilon = ctx.Attr("ln_epsilon"); } // outputs tensor // qkv 的值,并已经做了transpos后的值 // shape [3, batch_size, num_head, seq_len, dim_head] - auto *TransposeOut2 = ctx.Output("TransposeOut2"); + auto *TransposeOut2 = ctx.Output("TransposeOut2"); // shape [batch_size, num_head, seq_len, seq_len] - auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *softmax_out = ctx.Output("SoftmaxOut"); // shape [batch_size, num_head, seq_len, seq_len] - auto *attn_dropout_mask_out = ctx.Output("AttnDropoutMaskOut"); + auto *attn_dropout_mask_out = + ctx.Output("AttnDropoutMaskOut"); // shape [batch_size, num_head, seq_len, seq_len] - auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); + auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); // shape [[batch_size, seq_len, num_head, dim_head]] - auto *fmha_out = 
ctx.Output("FMHAOut"); + auto *fmha_out = ctx.Output("FMHAOut"); // shape [batch_size, seq_len, dim_embed] - auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); // final output // shape [batch_size, seq_len, dim_embed] - auto *out = ctx.Output("Y"); + auto *out = ctx.Output("Y"); // 下面这个tensor是不需要返回, 但是新的动态图需要 - auto *QKOut = ctx.Output("QKOut"); + auto *QKOut = ctx.Output("QKOut"); QKOut->mutable_data(ctx.GetPlace()); - auto *QKTVOut = ctx.Output("QKTVOut"); + auto *QKTVOut = ctx.Output("QKTVOut"); QKTVOut->mutable_data(ctx.GetPlace()); - auto *OutLinearOut = ctx.Output("OutLinearOut"); + auto *OutLinearOut = ctx.Output("OutLinearOut"); OutLinearOut->mutable_data(ctx.GetPlace()); - auto *QKVBiasOut = ctx.Output("QKVBiasOut"); + auto *QKVBiasOut = ctx.Output("QKVBiasOut"); QKVBiasOut->mutable_data(ctx.GetPlace()); - auto *SrcMaskOut = ctx.Output("SrcMaskOut"); + auto *SrcMaskOut = ctx.Output("SrcMaskOut"); SrcMaskOut->mutable_data(ctx.GetPlace()); - auto *qkv_out = ctx.Output("QKVOut"); + auto *qkv_out = ctx.Output("QKVOut"); qkv_out->mutable_data(ctx.GetPlace()); - Tensor *bias_dropout_residual_out = nullptr; - Tensor *ln_mean = nullptr; - Tensor *ln_var = nullptr; - Tensor *ln_out = nullptr; + phi::DenseTensor *bias_dropout_residual_out = nullptr; + phi::DenseTensor *ln_mean = nullptr; + phi::DenseTensor *ln_var = nullptr; + phi::DenseTensor *ln_out = nullptr; if (pre_layer_norm) { - ln_mean = ctx.Output("LnMean"); - ln_var = ctx.Output("LnVariance"); - ln_out = ctx.Output("LnOut"); + ln_mean = ctx.Output("LnMean"); + ln_var = ctx.Output("LnVariance"); + ln_out = ctx.Output("LnOut"); } else { - ln_mean = ctx.Output("Ln2Mean"); - ln_var = ctx.Output("Ln2Variance"); - bias_dropout_residual_out = ctx.Output("BiasDropoutResidualOut"); + ln_mean = ctx.Output("Ln2Mean"); + ln_var = ctx.Output("Ln2Variance"); + bias_dropout_residual_out = + ctx.Output("BiasDropoutResidualOut"); } // dropout info @@ -125,7 +125,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { bool is_upscale_in_train_1 = (dropout_implementation_1 == "upscale_in_train"); - auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + auto *seed_1 = + ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); @@ -468,7 +469,8 @@ class FusedAttentionGradXPUKernel : public framework::OpKernel { ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = (dropout_implementation_1 == "upscale_in_train"); - auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + auto *seed_1 = + ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); @@ -482,79 +484,81 @@ class FusedAttentionGradXPUKernel : public framework::OpKernel { XPUDropoutParam dropout_param(ctx, 0); // get inputs. 
- auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *d_y = ctx.Input(framework::GradVarName("Y")); const XPUTypeT *d_y_ptr = reinterpret_cast(d_y->data()); // 前向必要参数 - auto *input_x = ctx.Input("X"); + auto *input_x = ctx.Input("X"); const XPUTypeT *input_x_ptr = reinterpret_cast(input_x->data()); - auto *qkv_transpose_out = ctx.Input("TransposeOut2"); + auto *qkv_transpose_out = ctx.Input("TransposeOut2"); const XPUTypeT *qkv_transpose_out_ptr = reinterpret_cast(qkv_transpose_out->data()); - auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_weight = ctx.Input("QKVW"); const XPUTypeT *qkv_weight_ptr = reinterpret_cast(qkv_weight->data()); - auto *softmax_out = ctx.Input("SoftmaxOut"); + auto *softmax_out = ctx.Input("SoftmaxOut"); const XPUTypeT *softmax_out_ptr = reinterpret_cast(softmax_out->data()); - auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); const XPUTypeT *attn_dropout_out_ptr = reinterpret_cast(attn_dropout_out->data()); - auto *attn_dropout_mask = ctx.Input("AttnDropoutMaskOut"); + auto *attn_dropout_mask = ctx.Input("AttnDropoutMaskOut"); const XPUTypeT *attn_dropout_mask_ptr = reinterpret_cast(attn_dropout_mask->data()); - auto *fmha_out = ctx.Input("FMHAOut"); + auto *fmha_out = ctx.Input("FMHAOut"); const XPUTypeT *fmha_out_ptr = reinterpret_cast(fmha_out->data()); - auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_weight = ctx.Input("OutLinearW"); const XPUTypeT *out_linear_weight_ptr = reinterpret_cast(out_linear_weight->data()); - auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); const XPUTypeT *dropout_mask_out_ptr = reinterpret_cast(dropout_mask_out->data()); // 需要计算的梯度 - auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); + auto *d_qkv_weight = + ctx.Output(framework::GradVarName("QKVW")); XPUTypeT *d_qkv_weight_ptr = reinterpret_cast( d_qkv_weight->mutable_data(ctx.GetPlace())); - auto *d_qkv_bias = ctx.Output(framework::GradVarName("QKVBias")); + auto *d_qkv_bias = + ctx.Output(framework::GradVarName("QKVBias")); XPUTypeT *d_qkv_bias_ptr = reinterpret_cast( d_qkv_bias->mutable_data(ctx.GetPlace())); auto *d_out_linear_weight = - ctx.Output(framework::GradVarName("OutLinearW")); + ctx.Output(framework::GradVarName("OutLinearW")); XPUTypeT *d_out_linear_weight_ptr = reinterpret_cast( d_out_linear_weight->mutable_data(ctx.GetPlace())); auto *d_out_linear_bias = - ctx.Output(framework::GradVarName("OutLinearBias")); + ctx.Output(framework::GradVarName("OutLinearBias")); XPUTypeT *d_out_linear_bias_ptr = reinterpret_cast( d_out_linear_bias->mutable_data(ctx.GetPlace())); // 有可能需要 auto *d_src_mask_out = - ctx.Output(framework::GradVarName("SrcMaskOut")); + ctx.Output(framework::GradVarName("SrcMaskOut")); XPUTypeT *d_src_mask_out_ptr = (d_src_mask_out == nullptr) ? 
(nullptr) : (reinterpret_cast( d_src_mask_out->mutable_data(ctx.GetPlace()))); // 输出 dx - auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_x = ctx.Output(framework::GradVarName("X")); XPUTypeT *d_x_ptr = reinterpret_cast(d_x->mutable_data(ctx.GetPlace())); - const Tensor *ln_out = nullptr; - const Tensor *bias_dropout_residual_out = nullptr; - const Tensor *ln_scale = nullptr; - const Tensor *ln_mean = nullptr; - const Tensor *ln_var = nullptr; - Tensor *d_ln_scale = nullptr; - Tensor *d_ln_bias = nullptr; + const phi::DenseTensor *ln_out = nullptr; + const phi::DenseTensor *bias_dropout_residual_out = nullptr; + const phi::DenseTensor *ln_scale = nullptr; + const phi::DenseTensor *ln_mean = nullptr; + const phi::DenseTensor *ln_var = nullptr; + phi::DenseTensor *d_ln_scale = nullptr; + phi::DenseTensor *d_ln_bias = nullptr; const XPUTypeT *ln_out_ptr = NULL; const float *ln_scale_ptr = NULL; @@ -567,23 +571,28 @@ class FusedAttentionGradXPUKernel : public framework::OpKernel { float epsilon = 0.0f; if (pre_layer_norm) { - ln_out = ctx.Input("LnOut"); + ln_out = ctx.Input("LnOut"); ln_out_ptr = reinterpret_cast(ln_out->data()); - ln_scale = ctx.Input("LnScale"); - ln_mean = ctx.Input("LnMean"); - ln_var = ctx.Input("LnVariance"); + ln_scale = ctx.Input("LnScale"); + ln_mean = ctx.Input("LnMean"); + ln_var = ctx.Input("LnVariance"); epsilon = ctx.Attr("epsilon"); - d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); - d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + d_ln_scale = + ctx.Output(framework::GradVarName("LnScale")); + d_ln_bias = + ctx.Output(framework::GradVarName("LnBias")); } else { - ln_scale = ctx.Input("Ln2Scale"); - ln_mean = ctx.Input("Ln2Mean"); - ln_var = ctx.Input("Ln2Variance"); + ln_scale = ctx.Input("Ln2Scale"); + ln_mean = ctx.Input("Ln2Mean"); + ln_var = ctx.Input("Ln2Variance"); epsilon = ctx.Attr("ln_epsilon"); - d_ln_scale = ctx.Output(framework::GradVarName("Ln2Scale")); - d_ln_bias = ctx.Output(framework::GradVarName("Ln2Bias")); - bias_dropout_residual_out = ctx.Input("BiasDropoutResidualOut"); + d_ln_scale = + ctx.Output(framework::GradVarName("Ln2Scale")); + d_ln_bias = + ctx.Output(framework::GradVarName("Ln2Bias")); + bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); bias_dropout_residual_out_ptr = reinterpret_cast( bias_dropout_residual_out->data()); } diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 94131197060b5..02494e33e1241 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 664e20b686d7e..2562c2cc22575 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -25,8 +25,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index faf4a1aae44b6..e68be43eb7ec0 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -304,9 +304,9 @@ framework::OpKernelType FusedBatchNormActGradOp::GetExpectedKernelType( PADDLE_THROW(platform::errors::NotFound( "Can not find Y@GRAD in the execution context.")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index c7fbdc88abb33..4023aaa8445f9 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -30,7 +30,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -143,7 +142,7 @@ class FusedBatchNormActKernel size_t reserve_space_size = 0; void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. @@ -340,7 +339,7 @@ class FusedBatchNormActGradKernel size_t workspace_size = 0; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ACTIVATION; platform::ScopedActivationDescriptor scope_act_desc; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.h b/paddle/fluid/operators/fused/fused_bn_activation_op.h index f8aab994cb371..b71812db9d3d3 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.h @@ -26,7 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedBatchNormActOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index 2d51a3efaf699..08f7087b48d01 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -262,9 +262,9 @@ framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType( PADDLE_THROW(platform::errors::NotFound( "Can not find Y@GRAD in the execution context.")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 5a192b2df5c94..4c4756b8e1979 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -30,7 +30,6 @@ DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template @@ -120,7 +119,7 @@ class FusedBatchNormAddActKernel size_t reserve_space_size = 0; void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. @@ -296,7 +295,7 @@ class FusedBatchNormAddActGradKernel size_t workspace_size = 0; void *workspace_ptr = nullptr; - Tensor workspace_tensor; + phi::DenseTensor workspace_tensor; auto reserve_space_size = reserve_space->memory_size(); cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; platform::ScopedActivationDescriptor scope_act_desc; diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h index f4913bca3df98..bdb1f2f35444c 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -26,7 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedBatchNormAddActOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 8360f07a5f3e7..b8f2cc5b4b335 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -33,7 +33,6 @@ template class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = phi::DenseTensor; auto &device_ctx = context.template device_context(); auto ids = context.MultiInput("Ids"); auto embs = context.MultiInput("Embs"); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 9c58c6900959e..885f3412a4e06 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -182,16 +182,17 @@ void FusedEmbeddingFCLSTMOpMaker::Make() { "contains the ids to be looked up in W. " "The last dimension size must be 1."); AddInput("Embeddings", - "(Tensor) the learnable weights of X." + "(phi::DenseTensor) the learnable weights of X." " - The shape is (M x 4D), where M is the dim size of x, D is the " "hidden size. " " - Weight = {W_cx, W_ix, W_fx, W_ox}"); - AddInput("WeightH", - "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput( + "WeightH", + "(phi::DenseTensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); AddInput("Bias", - "(Tensor) the learnable weights. Almost same as LSTMOp" + "(phi::DenseTensor) the learnable weights. Almost same as LSTMOp" "Note: we should add the fc bias into this (1x4D) in bias." "input-hidden bias weight and peephole connections weight if " "setting `use_peepholes` True. " @@ -202,13 +203,15 @@ void FusedEmbeddingFCLSTMOpMaker::Make() { " - The shape is (1 x 7D). " " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); AddInput("H0", - "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) the initial hidden " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size and D is the hidden size.") .AsDispensable(); AddInput("C0", - "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) (the initial cell " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size. 
`H0` and `C0` can be NULL but only at the same time.") @@ -318,7 +321,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { /* diagonal weight*/ \ const T* wc_data = bias->data() + D4; \ /* for peephole only*/ \ - Tensor checked_cell; \ + phi::DenseTensor checked_cell; \ T* checked_cell_data = nullptr; \ auto place = ctx.GetPlace(); \ if (use_peepholes) { \ diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h index 181fa06b02034..19039ec55946d 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 0e4134d428094..9fa62a3704547 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -28,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; @@ -175,7 +174,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { auto len = ids_t->numel(); int idx_width = len / offset.back(); - Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + phi::DenseTensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; csr_vals_t.Resize({len}); csr_colmuns_t.Resize({len}); csr_row_idx_t.Resize({(batch_size + 1) * idx_width}); @@ -300,7 +299,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { auto len = ids->numel(); int idx_width = len / offset.back(); - Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + phi::DenseTensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; csr_vals_t.Resize({len}); csr_colmuns_t.Resize({len}); int64_t batch_size = ids_lod[0].size() - 1; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index aaf84c7b1eadb..3bf039829ac3d 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedFeedForwardOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 669672084b52b..28a9cb167e093 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -31,8 +31,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void AllReduce(phi::DenseTensor& tensor, // NOLINT const int ring_id, diff --git a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc index b94d37a921fb6..4b9ba95143345 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op_xpu.cc @@ -26,30 +26,28 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedFeedForwardXPUKernel : public framework::OpKernel { using XPUTypeT = typename XPUTypeTrait::Type; public: void FFN(const phi::XPUContext& dev_ctx, - const Tensor* x, - const Tensor* linear1_weight, - const Tensor* linear1_bias, - const Tensor* linear2_weight, - const Tensor* linear2_bias, - const Tensor* ln_scale, - const Tensor* ln_bias, - Tensor* out, - Tensor* dropout1_mask, - Tensor* dropout2_mask, - Tensor* ln_mean, - Tensor* ln_variance, - Tensor* linear1_out, - Tensor* ln1_out, - Tensor* dropout1_out, - Tensor* dropout2_out, + const phi::DenseTensor* x, + const phi::DenseTensor* linear1_weight, + const phi::DenseTensor* linear1_bias, + const phi::DenseTensor* linear2_weight, + const phi::DenseTensor* linear2_bias, + const phi::DenseTensor* ln_scale, + const phi::DenseTensor* ln_bias, + phi::DenseTensor* out, + phi::DenseTensor* dropout1_mask, + phi::DenseTensor* dropout2_mask, + phi::DenseTensor* ln_mean, + phi::DenseTensor* ln_variance, + phi::DenseTensor* linear1_out, + phi::DenseTensor* ln1_out, + phi::DenseTensor* dropout1_out, + phi::DenseTensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, @@ -255,41 +253,41 @@ class FusedFeedForwardXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto place = context.GetPlace(); - auto* x = context.Input("X"); + auto* x = context.Input("X"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear1_bias = context.Input("Linear1Bias"); - auto* linear2_weight = context.Input("Linear2Weight"); - auto* linear2_bias = context.Input("Linear2Bias"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto* linear2_weight = context.Input("Linear2Weight"); + auto* linear2_bias = context.Input("Linear2Bias"); const bool pre_layer_norm = context.Attr("pre_layer_norm"); - const Tensor* ln_scale = nullptr; - const Tensor* ln_bias = nullptr; - Tensor* ln_mean = nullptr; - Tensor* ln_variance = nullptr; - Tensor* ln1_out = nullptr; + const phi::DenseTensor* ln_scale = nullptr; + const phi::DenseTensor* ln_bias = nullptr; + phi::DenseTensor* ln_mean = nullptr; + phi::DenseTensor* ln_variance = nullptr; + phi::DenseTensor* ln1_out = nullptr; if (pre_layer_norm) { - ln_scale = context.Input("Ln1Scale"); - ln_bias = context.Input("Ln1Bias"); - ln_mean = context.Output("Ln1Mean"); - ln_variance = context.Output("Ln1Variance"); - ln1_out = context.Output("Ln1Out"); + ln_scale = context.Input("Ln1Scale"); + ln_bias = context.Input("Ln1Bias"); + ln_mean = context.Output("Ln1Mean"); + ln_variance = context.Output("Ln1Variance"); + ln1_out = context.Output("Ln1Out"); ln1_out->mutable_data(place); } else { - ln_scale = context.Input("Ln2Scale"); - ln_bias = context.Input("Ln2Bias"); - ln_mean = context.Output("Ln2Mean"); - ln_variance = context.Output("Ln2Variance"); + ln_scale = context.Input("Ln2Scale"); + ln_bias = context.Input("Ln2Bias"); + ln_mean = context.Output("Ln2Mean"); + ln_variance = context.Output("Ln2Variance"); } - auto* out = context.Output("Out"); - auto* dropout1_mask = context.Output("Dropout1Mask"); - auto* dropout2_mask = context.Output("Dropout2Mask"); - auto* linear1_out = context.Output("Linear1Out"); + auto* out = context.Output("Out"); + auto* dropout1_mask = context.Output("Dropout1Mask"); + auto* dropout2_mask = context.Output("Dropout2Mask"); + auto* linear1_out = 
context.Output("Linear1Out"); - auto* dropout1_out = context.Output("Dropout1Out"); - auto* dropout2_out = context.Output("Dropout2Out"); + auto* dropout1_out = context.Output("Dropout1Out"); + auto* dropout2_out = context.Output("Dropout2Out"); const std::string act_method = context.Attr("act_method"); @@ -356,26 +354,26 @@ class FusedFeedForwardGradXPUKernel : public framework::OpKernel { public: void FFNGrad(const phi::XPUContext& dev_ctx, - const Tensor* d_out, - const Tensor* x, - const Tensor* dropout1_mask, - const Tensor* dropout2_mask, - const Tensor* linear1_out, - const Tensor* ln1_out, - const Tensor* dropout1_out, - const Tensor* dropout2_out, - const Tensor* linear1_weight, - const Tensor* linear2_weight, - const Tensor* ln_scale, - const Tensor* ln_mean, - const Tensor* ln_variance, - Tensor* d_x, - Tensor* d_linear1_weight, - Tensor* d_linear1_bias, - Tensor* d_linear2_weight, - Tensor* d_linear2_bias, - Tensor* d_ln_scale, - Tensor* d_ln_bias, + const phi::DenseTensor* d_out, + const phi::DenseTensor* x, + const phi::DenseTensor* dropout1_mask, + const phi::DenseTensor* dropout2_mask, + const phi::DenseTensor* linear1_out, + const phi::DenseTensor* ln1_out, + const phi::DenseTensor* dropout1_out, + const phi::DenseTensor* dropout2_out, + const phi::DenseTensor* linear1_weight, + const phi::DenseTensor* linear2_weight, + const phi::DenseTensor* ln_scale, + const phi::DenseTensor* ln_mean, + const phi::DenseTensor* ln_variance, + phi::DenseTensor* d_x, + phi::DenseTensor* d_linear1_weight, + phi::DenseTensor* d_linear1_bias, + phi::DenseTensor* d_linear2_weight, + phi::DenseTensor* d_linear2_bias, + phi::DenseTensor* d_ln_scale, + phi::DenseTensor* d_ln_bias, const int bsz_seq, const int d_model, const int dim_feedforward, @@ -696,55 +694,61 @@ class FusedFeedForwardGradXPUKernel : public framework::OpKernel { auto place = context.GetPlace(); const bool pre_layer_norm = context.Attr("pre_layer_norm"); // inputs - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* x = context.Input("X"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); - auto* dropout1_mask = context.Input("Dropout1Mask"); - auto* dropout2_mask = context.Input("Dropout2Mask"); - auto* linear1_out = context.Input("Linear1Out"); - auto* ln1_out = pre_layer_norm ? context.Input("Ln1Out") : nullptr; + auto* dropout1_mask = context.Input("Dropout1Mask"); + auto* dropout2_mask = context.Input("Dropout2Mask"); + auto* linear1_out = context.Input("Linear1Out"); + auto* ln1_out = + pre_layer_norm ? 
context.Input("Ln1Out") : nullptr; - auto* dropout1_out = context.Input("Dropout1Out"); - auto* dropout2_out = context.Input("Dropout2Out"); - auto* linear1_weight = context.Input("Linear1Weight"); - auto* linear2_weight = context.Input("Linear2Weight"); + auto* dropout1_out = context.Input("Dropout1Out"); + auto* dropout2_out = context.Input("Dropout2Out"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear2_weight = context.Input("Linear2Weight"); - const Tensor* ln_mean = nullptr; - const Tensor* ln_variance = nullptr; - const Tensor* ln_scale = nullptr; + const phi::DenseTensor* ln_mean = nullptr; + const phi::DenseTensor* ln_variance = nullptr; + const phi::DenseTensor* ln_scale = nullptr; if (pre_layer_norm) { - ln_mean = context.Input("Ln1Mean"); - ln_variance = context.Input("Ln1Variance"); - ln_scale = context.Input("Ln1Scale"); + ln_mean = context.Input("Ln1Mean"); + ln_variance = context.Input("Ln1Variance"); + ln_scale = context.Input("Ln1Scale"); } else { - ln_mean = context.Input("Ln2Mean"); - ln_variance = context.Input("Ln2Variance"); - ln_scale = context.Input("Ln2Scale"); + ln_mean = context.Input("Ln2Mean"); + ln_variance = context.Input("Ln2Variance"); + ln_scale = context.Input("Ln2Scale"); } // output - auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_x = context.Output(framework::GradVarName("X")); - Tensor* d_ln_scale = nullptr; - Tensor* d_ln_bias = nullptr; + phi::DenseTensor* d_ln_scale = nullptr; + phi::DenseTensor* d_ln_bias = nullptr; if (pre_layer_norm) { - d_ln_scale = context.Output(framework::GradVarName("Ln1Scale")); - d_ln_bias = context.Output(framework::GradVarName("Ln1Bias")); + d_ln_scale = + context.Output(framework::GradVarName("Ln1Scale")); + d_ln_bias = + context.Output(framework::GradVarName("Ln1Bias")); } else { - d_ln_scale = context.Output(framework::GradVarName("Ln2Scale")); - d_ln_bias = context.Output(framework::GradVarName("Ln2Bias")); + d_ln_scale = + context.Output(framework::GradVarName("Ln2Scale")); + d_ln_bias = + context.Output(framework::GradVarName("Ln2Bias")); } - auto* d_linear1_weight = - context.Output(framework::GradVarName("Linear1Weight")); + auto* d_linear1_weight = context.Output( + framework::GradVarName("Linear1Weight")); auto* d_linear1_bias = - context.Output(framework::GradVarName("Linear1Bias")); - auto* d_linear2_weight = - context.Output(framework::GradVarName("Linear2Weight")); + context.Output(framework::GradVarName("Linear1Bias")); + auto* d_linear2_weight = context.Output( + framework::GradVarName("Linear2Weight")); auto* d_linear2_bias = - context.Output(framework::GradVarName("Linear2Bias")); + context.Output(framework::GradVarName("Linear2Bias")); float epsilon = 0.0f; if (pre_layer_norm) { diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index d55d047009255..b7611eff765d2 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline std::string MemoryDebugString(const phi::DenseTensor& t) { int device_id = platform::GetCurrentDeviceId(); int64_t allocated = @@ -233,17 +231,17 @@ struct GateAttentionConfig { } protected: - Tensor qkv_out; - Tensor query_out; - Tensor key_out; - Tensor value_out; + phi::DenseTensor qkv_out; + phi::DenseTensor query_out; + phi::DenseTensor key_out; + phi::DenseTensor value_out; // qk_out = BatchedGEMM(Q, K^T) // qk_out: shape=[batch_size, seq_len_m, num_heads, seq_len_r, m_size] // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) // The shape of qk_out, softmax_out is the same, thus can be called inplace. - Tensor qk_out; + phi::DenseTensor qk_out; // qktv_out may reuse gate_out. - Tensor qktv_out; + phi::DenseTensor qktv_out; }; template @@ -312,11 +310,11 @@ struct GateAttentionGradConfig : public GateAttentionConfig { } protected: - Tensor qkv_out_grad; - Tensor query_out_grad; - Tensor key_out_grad; - Tensor value_out_grad; - Tensor qk_out_grad; + phi::DenseTensor qkv_out_grad; + phi::DenseTensor query_out_grad; + phi::DenseTensor key_out_grad; + phi::DenseTensor value_out_grad; + phi::DenseTensor qk_out_grad; }; template @@ -461,10 +459,10 @@ class FMHAGateRef { T* k_grad_ptr = nullptr; T* v_grad_ptr = nullptr; - Tensor q_transpose_out_grad; - Tensor k_transpose_out_grad; - Tensor v_transpose_out_grad; - Tensor qkv_transpose_out_grad; + phi::DenseTensor q_transpose_out_grad; + phi::DenseTensor k_transpose_out_grad; + phi::DenseTensor v_transpose_out_grad; + phi::DenseTensor qkv_transpose_out_grad; if (merge_qkv_) { PADDLE_ENFORCE_NOT_NULL( qkv_transpose_out, @@ -513,7 +511,7 @@ class FMHAGateRef { v_transpose_out_grad.numel() * sizeof(T)); } - Tensor softmax_out_grad; + phi::DenseTensor softmax_out_grad; softmax_out_grad.Resize(config->softmax_out_dims); AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); @@ -521,7 +519,7 @@ class FMHAGateRef { config->batch_size * config->seq_len_m * config->num_heads; { // Forward: fmha_out = transpose(qktv_out) - Tensor qktv_out_grad; + phi::DenseTensor qktv_out_grad; qktv_out_grad.Resize(config->qktv_out_dims); AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index ce7929c39ffa8..c91bca47cf42f 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; class FusedGateAttentionOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 9cb3f19ab1740..8ca6cdb46ccd9 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct SigmoidMultiplyFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -64,8 +62,8 @@ struct SigmoidMultiplyGradFunctor { template void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *query, - Tensor *qkv_out) { + const phi::DenseTensor *query, + phi::DenseTensor *qkv_out) { // query: shape=[batch_size, seq_len_m, seq_len_r, qkv_dim] // qkv_weight: shape=[3, num_heads, head_dim, qkv_dim] // qkv_out: shape=[batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] @@ -83,9 +81,9 @@ void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, template void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *qkv_out_grad, - Tensor *query_grad, + const phi::DenseTensor *query, + const phi::DenseTensor *qkv_out_grad, + phi::DenseTensor *query_grad, bool use_addto) { auto *qkv_weight = ctx.Input("QKVWeight"); auto *qkv_weight_grad = @@ -111,11 +109,11 @@ void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, template void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *query, - const Tensor *key, - Tensor *query_out, - Tensor *key_out, - Tensor *value_out) { + const phi::DenseTensor *query, + const phi::DenseTensor *key, + phi::DenseTensor *query_out, + phi::DenseTensor *key_out, + phi::DenseTensor *value_out) { auto *query_weight = ctx.Input("QueryWeight"); auto *key_weight = ctx.Input("KeyWeight"); auto *value_weight = ctx.Input("ValueWeight"); @@ -149,13 +147,13 @@ void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, template void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *key, - const Tensor *query_out_grad, - const Tensor *key_out_grad, - const Tensor *value_out_grad, - Tensor *query_grad, - Tensor *key_grad, + const phi::DenseTensor *query, + const phi::DenseTensor *key, + const phi::DenseTensor *query_out_grad, + const phi::DenseTensor *key_out_grad, + const phi::DenseTensor *value_out_grad, + phi::DenseTensor *query_grad, + phi::DenseTensor *key_grad, bool use_addto) { // Gradient of GEMM(key, k_weight) const auto *key_weight = ctx.Input("KeyWeight"); @@ -209,9 +207,9 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, template void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *query, - const Tensor *fmha_out, - Tensor *gate_out) { + const phi::DenseTensor *query, + const phi::DenseTensor *fmha_out, + phi::DenseTensor *gate_out) { auto *gate_weight = ctx.Input("GateWeight"); auto *gate_bias = ctx.Input("GateBias"); @@ -228,8 +226,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, gate_weight, query, gate_bias, gate_out, gate_out); // gate_out = sigmoid(gate_out) * fmha_out - std::vector ins = {gate_out, fmha_out}; - std::vector outs = {gate_out}; + std::vector ins = {gate_out, fmha_out}; + std::vector outs = {gate_out}; phi::funcs::ElementwiseKernel( ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyFunctor()); } @@ -237,16 +235,16 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, template void 
ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *fmha_out, - const Tensor *gate_out_grad, - Tensor *query_grad, - Tensor *fmha_out_grad) { + const phi::DenseTensor *query, + const phi::DenseTensor *fmha_out, + const phi::DenseTensor *gate_out_grad, + phi::DenseTensor *query_grad, + phi::DenseTensor *fmha_out_grad) { const auto *gate_weight = ctx.Input("GateWeight"); const auto *gate_bias = ctx.Input("GateBias"); auto &dev_ctx = ctx.template device_context(); // Re-compute gate_bias_out - Tensor gate_bias_out; + phi::DenseTensor gate_bias_out; gate_bias_out.Resize(config.gate_out_dims); dev_ctx.Alloc(&gate_bias_out, gate_bias_out.numel() * sizeof(T)); @@ -260,8 +258,9 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, // Gradient of sigmoid(gate_bias_out) * fmha_out // Compute inplace and save gate_bias_out_grad to gate_bias_out. - std::vector ins = {gate_out_grad, &gate_bias_out, fmha_out}; - std::vector outs = {&gate_bias_out, fmha_out_grad}; + std::vector ins = { + gate_out_grad, &gate_bias_out, fmha_out}; + std::vector outs = {&gate_bias_out, fmha_out_grad}; phi::funcs::ElementwiseKernel, 2>( ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyGradFunctor()); @@ -284,8 +283,8 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, template void ComputeOutputLinearForward(const framework::ExecutionContext &ctx, const GateAttentionConfig &config, - const Tensor *fmha_or_gate_out, - Tensor *out) { + const phi::DenseTensor *fmha_or_gate_out, + phi::DenseTensor *out) { const auto *out_linear_weight = ctx.Input("OutLinearWeight"); const auto *out_linear_bias = ctx.Input("OutLinearBias"); @@ -303,8 +302,8 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx, template void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, - const Tensor *input, - Tensor *input_grad) { + const phi::DenseTensor *input, + phi::DenseTensor *input_grad) { auto &dev_ctx = ctx.template device_context(); const auto *out_grad = ctx.Input(framework::GradVarName("Out")); @@ -382,15 +381,15 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { query)); // 1. Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) - Tensor *qkv_out = config.GetQKVOut(); + phi::DenseTensor *qkv_out = config.GetQKVOut(); ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out); AllocWithDebugInfo(dev_ctx, "qkv_transpose_out", qkv_transpose_out); } else { // 1. Separated QKV Matmul - Tensor *query_out = config.GetQueryOut(); - Tensor *key_out = config.GetKeyOut(); - Tensor *value_out = config.GetValueOut(); + phi::DenseTensor *query_out = config.GetQueryOut(); + phi::DenseTensor *key_out = config.GetKeyOut(); + phi::DenseTensor *value_out = config.GetValueOut(); ComputeSeparatedQKVMatmulForward( ctx, config, query, key, query_out, key_out, value_out); @@ -418,7 +417,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { } // 4. Output Linear - Tensor *fmha_or_gate_out = has_gating ? gate_out : fmha_out; + phi::DenseTensor *fmha_or_gate_out = has_gating ? 
gate_out : fmha_out; ComputeOutputLinearForward(ctx, config, fmha_or_gate_out, out); } }; @@ -461,12 +460,12 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { GateAttentionGradConfig config( dev_ctx, query, key, query_weight, qkv_weight, merge_qkv, has_gating); - Tensor fmha_out_grad; + phi::DenseTensor fmha_out_grad; fmha_out_grad.Resize(config.gate_out_dims); AllocWithDebugInfo(dev_ctx, "fmha_out_grad", &fmha_out_grad); if (has_gating) { // 1. Gradient of Output Linear: out = Linear(gate_out) - Tensor gate_out_grad; + phi::DenseTensor gate_out_grad; gate_out_grad.Resize(config.gate_out_dims); AllocWithDebugInfo(dev_ctx, "gate_out_grad", &gate_out_grad); ComputeOutputLinearBackward(ctx, config, gate_out, &gate_out_grad); @@ -505,7 +504,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool use_addto = has_gating ? true : false; if (merge_qkv) { // 4. Gradient of Merged QKV Matmul - Tensor *qkv_out_grad = config.GetQKVOutGrad(); + phi::DenseTensor *qkv_out_grad = config.GetQKVOutGrad(); ComputeMergedQKVMatmulBackward( ctx, config, query, qkv_out_grad, query_grad, use_addto); } else { @@ -515,9 +514,9 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { if (key_grad) { AllocWithDebugInfo(dev_ctx, "key_grad", key_grad); } - Tensor *query_out_grad = config.GetQueryOutGrad(); - Tensor *key_out_grad = config.GetKeyOutGrad(); - Tensor *value_out_grad = config.GetValueOutGrad(); + phi::DenseTensor *query_out_grad = config.GetQueryOutGrad(); + phi::DenseTensor *key_out_grad = config.GetKeyOutGrad(); + phi::DenseTensor *value_out_grad = config.GetValueOutGrad(); ComputeSeparatedQKVMatmulBackward(ctx, config, query, diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index 1f9cbf320fb50..013593176aa2d 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FusedGemmEpilogueOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index e5bab3cae4fab..05beddc52211b 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedGemmEpilogueKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc index b1707ff55950d..687ce97068a35 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class FusedGemmEpilogueXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc index 3a9bd15c101e9..e1be5afa0bd68 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { private: static constexpr const char *OpName = "FusedMultiTransformerINT8Op"; @@ -176,7 +174,7 @@ class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "TimeStep") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu index fa22ee8d57e65..a4c11b85b9eeb 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu @@ -62,7 +62,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - Tensor ln_mean, ln_var; + phi::DenseTensor ln_mean, ln_var; ln_mean.Resize({{bsz_seq}}); auto *ln_mean_data = dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); @@ -86,7 +86,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // (transA, transB, compute_bias) = (false, trans_qkvw, false) AttnMatmulINT8 qkv_compute( dev_ctx, bsz_seq, output_size, input_size, compute_bias); - Tensor qkv_out; + phi::DenseTensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); @@ -123,7 +123,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { out_seq_len += time_step_value; } - Tensor transpose_out_2, qk_out; + phi::DenseTensor transpose_out_2, qk_out; transpose_out_2.Resize({{3, bsz, num_head, seq_len, dim_head}}); auto *transpose_out_2_data = dev_ctx.Alloc(&transpose_out_2, transpose_out_2.numel() * sizeof(T)); @@ -131,9 +131,9 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); - Tensor softmax_out; - Tensor attn_dropout_mask_out, attn_dropout_out; - Tensor qktv_out, fmha_out; + phi::DenseTensor softmax_out; + phi::DenseTensor attn_dropout_mask_out, attn_dropout_out; + phi::DenseTensor qktv_out, fmha_out; softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *softmax_out_data = dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); @@ -170,7 +170,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); - Tensor bias_dropout_residual_out, dropout_mask_out; + phi::DenseTensor 
bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); @@ -190,7 +190,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { int dim_ffn = ffn1_weight_dim[0]; AttnMatmulINT8 ffn1_linear_compute( dev_ctx, bsz_seq, dim_ffn, dim_embed, false); - Tensor ffn1_out; + phi::DenseTensor ffn1_out; ffn1_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_out_data = dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); @@ -201,7 +201,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); FusedDropoutHelper fused_act_dropout_helper_for_post_layernorm( dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); - Tensor ffn1_dropout_out, ffn1_dropout_mask; + phi::DenseTensor ffn1_dropout_out, ffn1_dropout_mask; ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_dropout_out_data = dev_ctx.Alloc( &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T)); @@ -228,7 +228,7 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); // []. init workspace for cublasLt transform - Tensor input_workspace, output_workspace, cublaslt_workspace; + phi::DenseTensor input_workspace, output_workspace, cublaslt_workspace; // for input and output transform data is CUBLASLT_ORDER_COL32 format, int m_max = bsz_seq, k_max = std::max(dim_embed, dim_ffn), n_max = std::max({output_size, dim_embed, dim_ffn}); @@ -248,15 +248,15 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); - Tensor *from_tensor = out; - Tensor tmp_out; + phi::DenseTensor *from_tensor = out; + phi::DenseTensor tmp_out; tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); - Tensor *buf0 = nullptr; - Tensor *buf1 = nullptr; + phi::DenseTensor *buf0 = nullptr; + phi::DenseTensor *buf1 = nullptr; // step0: x --> buf1 // step1: buf1 --> buf0 @@ -293,9 +293,10 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { #endif // step2. qkv - const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + const phi::DenseTensor *qkv_bias = + qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha - const Tensor *bias = time_step ? nullptr : qkv_bias; + const phi::DenseTensor *bias = time_step ? nullptr : qkv_bias; if (!pre_layer_norm && i == 0) { qkv_compute.ComputeForward(qkv_weights[i], input_x, @@ -337,8 +338,9 @@ class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { #endif // step3. fmha - const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; - Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + const phi::DenseTensor *cache_kv = + cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + phi::DenseTensor *cache_kv_out = cache_kv ? 
cache_kv_outs[i] : nullptr; if (time_step) { // generation decoder stage // [2, batch_size, num_head, max_seq_len, head_size] diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index 94a89338a6205..92b782c44c77a 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusedMultiTransformerOp : public framework::OperatorWithKernel { private: static constexpr const char *OpName = "FusedMultiTransformerOp"; @@ -143,7 +141,7 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "TimeStep") { VLOG(10) << "var_name:" << var_name << " need not to transform"; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index aeb00a7947cd6..5ca66cb132b05 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -40,7 +40,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ln_biases = ctx.MultiInput("LnBias"); auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - Tensor ln_mean, ln_var; + phi::DenseTensor ln_mean, ln_var; ln_mean.Resize({{bsz_seq}}); auto *ln_mean_data = dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); @@ -72,7 +72,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { input_size, /*compute_bias=*/false); - Tensor qkv_out; + phi::DenseTensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); @@ -116,7 +116,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { out_seq_len += cache_offset; } - Tensor q_transpose_out, kv_transpose_out, qk_out; + phi::DenseTensor q_transpose_out, kv_transpose_out, qk_out; q_transpose_out.Resize({{bsz, num_head, seq_len, dim_head}}); auto *q_transpose_out_data = dev_ctx.Alloc(&q_transpose_out, q_transpose_out.numel() * sizeof(T)); @@ -128,7 +128,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); - Tensor src_mask_out; + phi::DenseTensor src_mask_out; if (cache_offset > 0) { src_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *src_mask_out_data = @@ -136,7 +136,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { } // [2, bs, num_head, cache_seq_len + seq_len, head_dim] - Tensor pre_cache_kv_out; + phi::DenseTensor pre_cache_kv_out; if (cache_offset > 0) { pre_cache_kv_out.Resize( {{2, bsz, num_head, seq_len + cache_offset, dim_head}}); @@ -144,9 +144,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { &pre_cache_kv_out, pre_cache_kv_out.numel() * sizeof(T)); } - Tensor softmax_out; - Tensor attn_dropout_mask_out, attn_dropout_out; - Tensor qktv_out, fmha_out; + phi::DenseTensor softmax_out; + phi::DenseTensor attn_dropout_mask_out, attn_dropout_out; + phi::DenseTensor qktv_out, fmha_out; 
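
The kernel-side hunks above all apply one mechanical change: the file-local alias `using Tensor = phi::DenseTensor;` is removed, and every remaining use of `Tensor` (local temporaries, pointer declarations, and the template arguments of `ctx.Input<...>()` / `ctx.Output<...>()`) is spelled out as `phi::DenseTensor`. Below is a minimal, self-contained C++ sketch of that before/after shape; `StubContext` and the `phi::DenseTensor` defined here are illustrative stand-ins, not the real Paddle classes.

// Stand-in types: just enough structure to show the alias-removal pattern.
#include <string>

namespace phi {
class DenseTensor {};  // placeholder for the real phi::DenseTensor
}  // namespace phi

// Mimics the Input<T>()/Output<T>() accessors of framework::ExecutionContext.
class StubContext {
 public:
  template <typename T>
  const T* Input(const std::string& /*name*/) const { return &tensor_; }
  template <typename T>
  T* Output(const std::string& /*name*/) { return &tensor_; }

 private:
  phi::DenseTensor tensor_;
};

// Before the patch, kernels wrote:
//   using Tensor = phi::DenseTensor;
//   const Tensor* x = ctx.Input<Tensor>("X");
//   Tensor* out = ctx.Output<Tensor>("Out");
// After the patch, the alias is gone and the type is written out in full:
void ComputeSketch(StubContext& ctx) {
  const phi::DenseTensor* x = ctx.Input<phi::DenseTensor>("X");
  phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
  phi::DenseTensor workspace;  // local temporaries change the same way
  (void)x;
  (void)out;
  (void)workspace;
}

int main() {
  StubContext ctx;
  ComputeSketch(ctx);
  return 0;
}
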
softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *softmax_out_data = dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); @@ -179,7 +179,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); - Tensor bias_dropout_residual_out, dropout_mask_out; + phi::DenseTensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); @@ -202,7 +202,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { const phi::DDim ffn1_input_shape({bsz_seq, dim_embed}); ffn1_cublas_linear.Setup(ffn1_input_shape, ffn1_weight_dim, false, false); - Tensor ffn1_out; + phi::DenseTensor ffn1_out; ffn1_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_out_data = dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); @@ -223,15 +223,15 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); - Tensor *from_tensor = out; - Tensor tmp_out; + phi::DenseTensor *from_tensor = out; + phi::DenseTensor tmp_out; tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); - Tensor *buf0 = nullptr; - Tensor *buf1 = nullptr; + phi::DenseTensor *buf0 = nullptr; + phi::DenseTensor *buf1 = nullptr; // step0: x --> buf1 // step1: buf1 --> buf0 @@ -270,9 +270,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step2. qkv - const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + const phi::DenseTensor *qkv_bias = + qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha - const Tensor *bias = time_step ? nullptr : qkv_bias; + const phi::DenseTensor *bias = time_step ? nullptr : qkv_bias; if (!pre_layer_norm && i == 0) { qkv_compute.ComputeForward( qkv_weights[i], input_x, bias, &qkv_out, &qkv_out); @@ -285,8 +286,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step3. fmha - const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; - Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + const phi::DenseTensor *cache_kv = + cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + phi::DenseTensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; if (time_step) { // generation decoder stage // [2, batch_size, num_head, max_seq_len, head_size] @@ -304,11 +306,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { time_step->data()[0], 1. / sqrt(dim_head)); } else if (cache_kv_out) { // generation context stage - const Tensor *pre_cache_kv_tensor = + const phi::DenseTensor *pre_cache_kv_tensor = pre_caches.size() > 0 ? pre_caches[i] : nullptr; - Tensor *pre_cache_kv_out_tmp = + phi::DenseTensor *pre_cache_kv_out_tmp = cache_offset > 0 ? &pre_cache_kv_out : nullptr; - Tensor *src_mask_tmp = cache_offset > 0 ? &src_mask_out : nullptr; + phi::DenseTensor *src_mask_tmp = + cache_offset > 0 ? 
&src_mask_out : nullptr; qkv_bias_add_transpose_split(dev_ctx, q_transpose_out_data, kv_transpose_out_data, @@ -554,7 +557,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ln_biases = ctx.MultiInput("LnBias"); auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); - Tensor ln_mean, ln_var; + phi::DenseTensor ln_mean, ln_var; ln_mean.Resize({{bsz_seq}}); auto *ln_mean_data = dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); @@ -586,7 +589,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { input_size, /*compute_bias=*/false); - Tensor qkv_out; + phi::DenseTensor qkv_out; qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); @@ -630,7 +633,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { out_seq_len += cache_offset; } - Tensor q_transpose_out, kv_transpose_out, qk_out; + phi::DenseTensor q_transpose_out, kv_transpose_out, qk_out; q_transpose_out.Resize({{bsz, num_head, seq_len, dim_head}}); auto *q_transpose_out_data = dev_ctx.Alloc(&q_transpose_out, q_transpose_out.numel() * sizeof(T)); @@ -642,7 +645,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); - Tensor src_mask_out; + phi::DenseTensor src_mask_out; if (cache_offset > 0) { src_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *src_mask_out_data = @@ -650,7 +653,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { } // [2, bs, num_head, cache_seq_len + seq_len, head_dim] - Tensor pre_cache_kv_out; + phi::DenseTensor pre_cache_kv_out; if (cache_offset > 0) { pre_cache_kv_out.Resize( {{2, bsz, num_head, seq_len + cache_offset, dim_head}}); @@ -658,9 +661,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { &pre_cache_kv_out, pre_cache_kv_out.numel() * sizeof(T)); } - Tensor softmax_out; - Tensor attn_dropout_mask_out, attn_dropout_out; - Tensor qktv_out, fmha_out; + phi::DenseTensor softmax_out; + phi::DenseTensor attn_dropout_mask_out, attn_dropout_out; + phi::DenseTensor qktv_out, fmha_out; softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); auto *softmax_out_data = dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); @@ -693,7 +696,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); - Tensor bias_dropout_residual_out, dropout_mask_out; + phi::DenseTensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); @@ -713,7 +716,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { int dim_ffn = ffn1_weight_dim[1]; auto ffn1_linear_compute = AttnMatMul( dev_ctx, false, false, bsz_seq, dim_ffn, dim_embed, false); - Tensor ffn1_out; + phi::DenseTensor ffn1_out; ffn1_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_out_data = dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); @@ -722,7 +725,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); FusedDropoutHelper fused_act_dropout_helper( dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); - Tensor ffn1_dropout_out, 
ffn1_dropout_mask; + phi::DenseTensor ffn1_dropout_out, ffn1_dropout_mask; ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}}); auto *ffn1_dropout_out_data = dev_ctx.Alloc( &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T)); @@ -744,15 +747,15 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); - Tensor *from_tensor = out; - Tensor tmp_out; + phi::DenseTensor *from_tensor = out; + phi::DenseTensor tmp_out; tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); - Tensor *buf0 = nullptr; - Tensor *buf1 = nullptr; + phi::DenseTensor *buf0 = nullptr; + phi::DenseTensor *buf1 = nullptr; // step0: x --> buf1 // step1: buf1 --> buf0 @@ -791,9 +794,10 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step2. qkv - const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + const phi::DenseTensor *qkv_bias = + qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; // NOTE: in decoder stage, bias is fused in fmha - const Tensor *bias = time_step ? nullptr : qkv_bias; + const phi::DenseTensor *bias = time_step ? nullptr : qkv_bias; if (!pre_layer_norm && i == 0) { qkv_compute.ComputeForward( qkv_weights[i], input_x, bias, &qkv_out, &qkv_out); @@ -806,8 +810,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif // step3. fmha - const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; - Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + const phi::DenseTensor *cache_kv = + cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + phi::DenseTensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; if (time_step) { // generation decoder stage // [2, batch_size, num_head, max_seq_len, head_size] @@ -825,11 +830,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { time_step->data()[0], 1. / sqrt(dim_head)); } else if (cache_kv_out) { // generation context stage - const Tensor *pre_cache_kv_tensor = + const phi::DenseTensor *pre_cache_kv_tensor = pre_caches.size() > 0 ? pre_caches[i] : nullptr; - Tensor *pre_cache_kv_out_tmp = + phi::DenseTensor *pre_cache_kv_out_tmp = cache_offset > 0 ? &pre_cache_kv_out : nullptr; - Tensor *src_mask_tmp = cache_offset > 0 ? &src_mask_out : nullptr; + phi::DenseTensor *src_mask_tmp = + cache_offset > 0 ? 
&src_mask_out : nullptr; qkv_bias_add_transpose_split(dev_ctx, q_transpose_out_data, kv_transpose_out_data, diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index 69ac06206c62b..0500f76110f33 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -44,8 +44,6 @@ DECLARE_bool(gemm_use_half_precision_compute_type); namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // for debug // #define _DEBUG_FUSED_MULTI_TRANSFORMER @@ -1119,11 +1117,11 @@ void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, template void fmha(const phi::GPUContext &dev_ctx, - const Tensor &qkv_tensor, - const Tensor &qkv_bias_tensor, - const Tensor &src_mask_tensor, - Tensor *cache_kv_tensor, - Tensor *out_tensor, + const phi::DenseTensor &qkv_tensor, + const phi::DenseTensor &qkv_bias_tensor, + const phi::DenseTensor &src_mask_tensor, + phi::DenseTensor *cache_kv_tensor, + phi::DenseTensor *out_tensor, int batch_size, int max_seq_length, int num_head, diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 07cfb44a312bc..519ce1c6aca08 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -20,7 +20,6 @@ namespace paddle { namespace operators { #if CUDNN_VERSION >= 7100 -using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 814631bd87b47..fc7804f9c4e8c 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -160,26 +160,29 @@ void FusionGRUOpMaker::Make() { "variable-time length input sequence. The underlying tensor in " "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); - AddInput("H0", - "(Tensor, optional) The initial hidden state is an optional " - "input. This is a tensor with shape (N x D), where N is the " - "batch size, D is the hidden size.") + AddInput( + "H0", + "(phi::DenseTensor, optional) The initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size, D is the hidden size.") .AsDispensable(); AddInput("WeightX", - "(Tensor) The FC weight with shape (M x 3D)," + "(phi::DenseTensor) The FC weight with shape (M x 3D)," "where M is the dim size of x, D is the hidden size. "); - AddInput("WeightH", - "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. " - "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" - "Acutally they are D x 2D and D x D two part weights." - "{W_update, W_reset; W_state}" - "{D x (D + D); D x D}"); + AddInput( + "WeightH", + "(phi::DenseTensor) (D x 3D) Same as GRUOp, where D is the hidden size. " + "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" + "Acutally they are D x 2D and D x D two part weights." + "{W_update, W_reset; W_state}" + "{D x (D + D); D x D}"); AddInput("Bias", - "(Tensor, optional) (1 x 3D)." + "(phi::DenseTensor, optional) (1 x 3D)." "Almost same as GRUOp." 
"Note: if have FC bias it should be added on this bias.") .AsDispensable(); - AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.") + AddOutput("ReorderedH0", + "(phi::DenseTensor) (N x D), which N is the min-batch size.") .AsIntermediate(); AddOutput("XX", "(phi::DenseTensor) the result after X * WeightX (size is T x 3D)" diff --git a/paddle/fluid/operators/fused/fusion_gru_op.h b/paddle/fluid/operators/fused/fusion_gru_op.h index 4df5042089053..94bf38068d0dd 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.h +++ b/paddle/fluid/operators/fused/fusion_gru_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionGRUOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index b612d590ea1d9..c526fdc18428c 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -184,16 +184,17 @@ void FusionLSTMOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); AddInput("WeightX", - "(Tensor) the learnable weights of X." + "(phi::DenseTensor) the learnable weights of X." " - The shape is (M x 4D), where M is the dim size of x, D is the " "hidden size. " " - Weight = {W_cx, W_ix, W_fx, W_ox}"); - AddInput("WeightH", - "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." - " - The shape is (D x 4D), where D is the hidden size. " - " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput( + "WeightH", + "(phi::DenseTensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); AddInput("Bias", - "(Tensor) the learnable weights. Almost same as LSTMOp" + "(phi::DenseTensor) the learnable weights. Almost same as LSTMOp" "Note: we should add the fc bias into this (1x4D) in bias." "input-hidden bias weight and peephole connections weight if " "setting `use_peepholes` True. " @@ -204,13 +205,15 @@ void FusionLSTMOpMaker::Make() { " - The shape is (1 x 7D). " " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); AddInput("H0", - "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) the initial hidden " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size and D is the hidden size.") .AsDispensable(); AddInput("C0", - "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "(phi::DenseTensor, optional) (same as LSTMOp) (the initial cell " + "state is an " "optional " "input. This is a tensor with shape (N x D), where N is the " "batch size. 
`H0` and `C0` can be NULL but only at the same time.") @@ -234,7 +237,7 @@ void FusionLSTMOpMaker::Make() { AddOutput("BatchedCell", "(phi::DenseTensor) (T x D).").AsIntermediate(); AddOutput("ReorderedH0", "(phi::DenseTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(phi::DenseTensor) (N x D).").AsIntermediate(); - AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.") + AddOutput("CheckedCell", "(phi::DenseTensor) (2 x D) only for peephole.") .AsIntermediate(); AddAttr("use_peepholes", "(bool, default: True) " diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.h b/paddle/fluid/operators/fused/fusion_lstm_op.h index 590d4bd7c2914..93f8eb981bbd9 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.h +++ b/paddle/fluid/operators/fused/fusion_lstm_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionLSTMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 7bad7c78edc75..bab06f55be856 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -107,10 +107,12 @@ framework::OpKernelType FusionRepeatedFCReluOp::GetExpectedKernelType( void FusionRepeatedFCReluOpMaker::Make() { AddInput("X", "(phi::DenseTensor) Input tensors of this operator."); - AddInput("W", "(Tensor) The weight tensors of this operator.").AsDuplicable(); - AddInput("Bias", "(Tensor) The bias tensors of this operator.") + AddInput("W", "(phi::DenseTensor) The weight tensors of this operator.") .AsDuplicable(); - AddOutput("ReluOut", "(Tensor) The output tensor of each relu operator.") + AddInput("Bias", "(phi::DenseTensor) The bias tensors of this operator.") + .AsDuplicable(); + AddOutput("ReluOut", + "(phi::DenseTensor) The output tensor of each relu operator.") .AsDuplicable() .AsIntermediate(); AddOutput("Out", "(phi::DenseTensor) Output tensor of this operator."); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h index 2cfb404913c42..16025bf5181b6 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionRepeatedFCReluOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index cb08e4fbff258..c9166919636bf 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -102,14 +102,16 @@ void FusionSeqConvEltAddReluOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T X M), where T is the " "total time steps in this mini-batch, M is the dim size of x."); // PaddingData only support false yet, should be ensured at pass. - AddInput("Filter", - "(Tensor) same as the input(Filter) of sequence conv op is an " - "learnable parameter." 
- "This is a tensor with shape (K, N), where K is the " - "context_length * dim size of x, N is the output feature size."); - AddInput("Bias", - "(Tensor) the learnable weights. shape (1, N), where N is the " - "output feature size"); + AddInput( + "Filter", + "(phi::DenseTensor) same as the input(Filter) of sequence conv op is an " + "learnable parameter." + "This is a tensor with shape (K, N), where K is the " + "context_length * dim size of x, N is the output feature size."); + AddInput( + "Bias", + "(phi::DenseTensor) the learnable weights. shape (1, N), where N is the " + "output feature size"); AddOutput( "Out", "(phi::DenseTensor) the output(Out) is a LodTensor, which support " @@ -117,7 +119,7 @@ void FusionSeqConvEltAddReluOpMaker::Make() { "this phi::DenseTensor is a matrix with shape (T, N), where, T is the " "total time steps in this mini-batch, N is the output feature size."); AddOutput("ColMat", - "(Tensor) (T, K), where T is where T is the " + "(phi::DenseTensor) (T, K), where T is where T is the " "total time steps in this mini-batch, K is height of Filter") .AsIntermediate(); AddAttr("contextLength", diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h index d1b7ae835821f..96f231f9a3cd5 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqConvEltAddReluOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index bcc8ee894543f..df4cbba1dec15 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -114,12 +114,13 @@ void FusionSeqExpandConcatFCOpMaker::Make() { "ref lod " "for sequence expand, and the rest input should have same lod.") .AsDuplicable(); - AddInput("FCWeight", "(Tensor) the weights of fc."); - AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable(); + AddInput("FCWeight", "(phi::DenseTensor) the weights of fc."); + AddInput("FCBias", "(phi::DenseTensor, optional) the bias of fc.") + .AsDispensable(); AddOutput("Out", "(phi::DenseTensor) Output LodTensor."); AddOutput( "FCOut", - "(Tensor) the intermediate tensor to keep the result of fc." + "(phi::DenseTensor) the intermediate tensor to keep the result of fc." "Shape is (N x D), where N is the batch size, D is the output dim of fc") .AsIntermediate(); AddAttr("fc_activation", diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h index 9c611025351e8..495de5f233445 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqExpandConcatFCOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h index 6dc29b23cbb89..2e2d6e07dc7e5 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqPoolConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index 41944f4bc095f..e3953f9e6abc0 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -77,7 +77,8 @@ void FusionSeqPoolCVMConcatOpMaker::Make() { AddInput("X", "(phi::DenseTensor) Input tensors of this operator.") .AsDuplicable(); AddInput("CVM", - "(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch " + "(phi::DenseTensor), a 2-D phi::DenseTensor with shape [N x 2], " + "where N is the batch " "size, 2 is show and click."); AddOutput("Out", "(phi::DenseTensor) Output tensor of concat operator."); AddAttr("pooltype", diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h index 24a02553044b0..b9d7d0dfc340e 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class FusionSeqPoolCVMConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index b7a01b7955887..8d7f792f3c25b 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -70,12 +70,12 @@ framework::OpKernelType FusionSquaredMatSubOp::GetExpectedKernelType( } void FusionSquaredMatSubOpMaker::Make() { - AddInput("X", "(Tensor) Input Mat A of this operator."); - AddInput("Y", "(Tensor) Input Mat B of this operator."); - AddOutput("SquaredX", "(Tensor) Squared X.").AsIntermediate(); - AddOutput("SquaredY", "(Tensor) Squared Y.").AsIntermediate(); - AddOutput("SquaredXY", "(Tensor) Squared X*Y.").AsIntermediate(); - AddOutput("Out", "(Tensor) Output tensor of concat operator."); + AddInput("X", "(phi::DenseTensor) Input Mat A of this operator."); + AddInput("Y", "(phi::DenseTensor) Input Mat B of this operator."); + AddOutput("SquaredX", "(phi::DenseTensor) Squared X.").AsIntermediate(); + AddOutput("SquaredY", "(phi::DenseTensor) Squared Y.").AsIntermediate(); + AddOutput("SquaredXY", "(phi::DenseTensor) Squared X*Y.").AsIntermediate(); + AddOutput("Out", "(phi::DenseTensor) Output tensor of concat operator."); AddAttr("scalar", "The scalar on output matrix.").SetDefault(1.f); AddComment(R"DOC( Fusion Squared Matrix and substrct operator. 
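
The OpMaker hunks in this part of the patch make the matching substitution inside operator documentation strings: each "(Tensor ...)" prefix in an AddInput/AddOutput description becomes "(phi::DenseTensor ...)", with the adjacent string literals re-wrapped so each source line stays within the formatter's length limit. The sketch below imitates only the builder surface of that pattern; StubOpMaker and its AddInput/AsDispensable methods are stand-ins that merely record strings, not Paddle's real proto maker API.

// Stand-in maker: records descriptions so the re-worded doc strings can be
// printed; it is not the real framework::OpProtoAndCheckerMaker.
#include <iostream>
#include <string>
#include <vector>

class StubOpMaker {
 public:
  StubOpMaker& AddInput(const std::string& name, const std::string& doc) {
    entries_.push_back(name + ": " + doc);
    return *this;
  }
  StubOpMaker& AsDispensable() {
    entries_.back() += " [dispensable]";
    return *this;
  }
  void Print() const {
    for (const auto& e : entries_) std::cout << e << "\n";
  }

 private:
  std::vector<std::string> entries_;
};

int main() {
  StubOpMaker maker;
  // Before: "(Tensor, optional) The initial hidden state is an optional input ..."
  // After the patch the prefix names the concrete type, and adjacent string
  // literals are split so each source line stays short:
  maker
      .AddInput("H0",
                "(phi::DenseTensor, optional) The initial hidden state is an "
                "optional input, shaped (N x D).")
      .AsDispensable();
  maker.Print();
  return 0;
}
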
diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h index 7707bb14fcefe..fc6a54fd9eb03 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar class FusionSquaredMatSubOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 2e8b6f7d0a6b8..ba2b71ff6ffd7 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -273,7 +273,6 @@ template class MultiHeadMatMulV2Kernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = phi::DenseTensor; auto *input = context.Input("Input"); auto *w = context.Input("W"); auto *bias = context.Input("Bias"); @@ -296,7 +295,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { int batch = input_dims[0]; int seq_len = input_dims[1]; int hidden = input_dims[2]; - Tensor temp_bias_tensor; + phi::DenseTensor temp_bias_tensor; // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted if (bias_qk && bias_qk->numel() == (batch * seq_len)) { VLOG(4) << "Do broadcasted bias_qk from [batch, 1, 1, seq_len]"; @@ -343,13 +342,13 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { device_ctx.template Alloc(out, out->numel() * sizeof(T)); // (B*S, hidden) - const Tensor input_matrix = + const phi::DenseTensor input_matrix = framework::ReshapeToMatrix(*input, 2 /*x_num_col_dims */); // (hidden, 3 * all_head_size) - const Tensor w_matrix = + const phi::DenseTensor w_matrix = framework::ReshapeToMatrix(*w, 1 /*y_num_col_dims*/); - Tensor temp_out_tensor; + phi::DenseTensor temp_out_tensor; auto temp_out_dims = phi::make_ddim({batch, seq_len, 3, head_number, head_size}); temp_out_tensor.Resize( @@ -364,7 +363,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { VLOG(2) << temp_out_tensor; // temp_out_tensor.Resize(temp_out_dims); - Tensor multihead_temp_tensor; + phi::DenseTensor multihead_temp_tensor; // B * head_number * S * S * 1 + B * S * 3 * N * H int scratch_size = batch * head_number * seq_len * seq_len * 1; multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()}); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc index 76f173c2d6d09..b449ca3bbe8da 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ResNetBasicBlockOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index 8310116849611..f6b2d30453f42 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -21,8 +21,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class ResnetBasicBlockAttr { public: explicit ResnetBasicBlockAttr(const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index b2d44057365b9..4b46dc76b260e 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - // Shape of bitmask static framework::DDim GetBitmaskDims(std::vector out_shape) { int c = out_shape.back(); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index 02bde0ef04ff2..446d289a1b959 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ResNetUnitKernel : public framework::OpKernel { public: @@ -39,20 +37,23 @@ class ResNetUnitKernel : public framework::OpKernel { "ResNetUnitOp only supports float16 for now.")); // input x - const Tensor *input_x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); + const phi::DenseTensor *input_x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *bias_x = ctx.Input("BiasX"); // norm conv - Tensor *conv_out_x = ctx.Output("ConvX"); + phi::DenseTensor *conv_out_x = ctx.Output("ConvX"); // bn finalize - Tensor *saved_mean_x = ctx.Output("SavedMeanX"); - Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); - Tensor *running_mean_x = ctx.Output("RunningMeanX"); - Tensor *running_var_x = ctx.Output("RunningVarX"); + phi::DenseTensor *saved_mean_x = ctx.Output("SavedMeanX"); + phi::DenseTensor *saved_invstd_x = + ctx.Output("SavedInvstdX"); + phi::DenseTensor *running_mean_x = + ctx.Output("RunningMeanX"); + phi::DenseTensor *running_var_x = + ctx.Output("RunningVarX"); // sbar - Tensor *output = ctx.Output("Y"); - Tensor *bitmask = ctx.Output("BitMask"); + phi::DenseTensor *output = ctx.Output("Y"); + phi::DenseTensor *bitmask = ctx.Output("BitMask"); // attrs int padding = ctx.Attr("padding"); int stride = ctx.Attr("stride"); @@ -93,8 +94,8 @@ class ResNetUnitKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); // 1. Conv - Tensor sum_x; - Tensor sum_of_squares_x; + phi::DenseTensor sum_x; + phi::DenseTensor sum_of_squares_x; sum_x.Resize(param_dims); sum_of_squares_x.Resize(param_dims); CudnnNormConvolution conv_x_op(dev_ctx, @@ -109,8 +110,8 @@ class ResNetUnitKernel : public framework::OpKernel { dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x, &sum_of_squares_x); // 2. 
BN - Tensor equiv_scale_x; - Tensor equiv_bias_x; + phi::DenseTensor equiv_scale_x; + phi::DenseTensor equiv_bias_x; equiv_scale_x.Resize(param_dims); equiv_bias_x.Resize(param_dims); CudnnBNStatsFinalize bn_x_op(dev_ctx, param_shape); @@ -140,24 +141,28 @@ class ResNetUnitKernel : public framework::OpKernel { bitmask_shape); if (has_shortcut) { // input z - const Tensor *input_z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); + const phi::DenseTensor *input_z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); // norm conv - Tensor *conv_out_z = ctx.Output("ConvZ"); + phi::DenseTensor *conv_out_z = ctx.Output("ConvZ"); // bn finalize - Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); - Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); - Tensor *running_mean_z = ctx.Output("RunningMeanZ"); - Tensor *running_var_z = ctx.Output("RunningVarZ"); + phi::DenseTensor *saved_mean_z = + ctx.Output("SavedMeanZ"); + phi::DenseTensor *saved_invstd_z = + ctx.Output("SavedInvstdZ"); + phi::DenseTensor *running_mean_z = + ctx.Output("RunningMeanZ"); + phi::DenseTensor *running_var_z = + ctx.Output("RunningVarZ"); auto input_z_shape = phi::vectorize(input_z->dims()); auto filter_z_shape = phi::vectorize(filter_z->dims()); // 3.1 Conv for second input - Tensor sum_z; - Tensor sum_of_squares_z; + phi::DenseTensor sum_z; + phi::DenseTensor sum_of_squares_z; sum_z.Resize(param_dims); sum_of_squares_z.Resize(param_dims); CudnnNormConvolution conv_z_op(dev_ctx, @@ -172,8 +177,8 @@ class ResNetUnitKernel : public framework::OpKernel { dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z, &sum_of_squares_z); // 3.2 BN for second input - Tensor equiv_scale_z; - Tensor equiv_bias_z; + phi::DenseTensor equiv_scale_z; + phi::DenseTensor equiv_bias_z; equiv_scale_z.Resize(param_dims); equiv_bias_z.Resize(param_dims); CudnnBNStatsFinalize bn_z_op(dev_ctx, param_shape); @@ -203,7 +208,7 @@ class ResNetUnitKernel : public framework::OpKernel { output, bitmask); } else { - const Tensor *input_z = + const phi::DenseTensor *input_z = fuse_add ? 
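Across the ResNetUnit kernels only the declared pointer types change; the retrieval calls keep their template argument. A condensed sketch of that pattern, with the <phi::DenseTensor> arguments written out as assumed from the standard ExecutionContext API:

    // Inputs of the fused unit, declared with the concrete phi type.
    const phi::DenseTensor *input_x = ctx.Input<phi::DenseTensor>("X");
    const phi::DenseTensor *filter_x = ctx.Input<phi::DenseTensor>("FilterX");
    // Outputs, including the BN statistics produced by the fused kernel.
    phi::DenseTensor *conv_out_x = ctx.Output<phi::DenseTensor>("ConvX");
    phi::DenseTensor *saved_mean_x =
        ctx.Output<phi::DenseTensor>("SavedMeanX");
    phi::DenseTensor *output = ctx.Output<phi::DenseTensor>("Y");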
ctx.Input("Z") : nullptr; sbar_op.Forward(dev_ctx, *conv_out_x, @@ -231,26 +236,29 @@ class ResNetUnitGradKernel : public framework::OpKernel { platform::errors::Unavailable( "ResNetUnitOp only supports float16 for now.")); - const Tensor *y_grad = + const phi::DenseTensor *y_grad = ctx.Input(framework::GradVarName("Y")); - const Tensor *x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); - const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); - const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); - - const Tensor *conv_out_x = ctx.Input("ConvX"); - const Tensor *output = ctx.Input("Y"); - const Tensor *bitmask = ctx.Input("BitMask"); - - Tensor *x_grad = ctx.Output(framework::GradVarName("X")); - Tensor *filter_x_grad = + const phi::DenseTensor *x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *bias_x = ctx.Input("BiasX"); + const phi::DenseTensor *saved_mean_x = + ctx.Input("SavedMeanX"); + const phi::DenseTensor *saved_invstd_x = + ctx.Input("SavedInvstdX"); + + const phi::DenseTensor *conv_out_x = ctx.Input("ConvX"); + const phi::DenseTensor *output = ctx.Input("Y"); + const phi::DenseTensor *bitmask = ctx.Input("BitMask"); + + phi::DenseTensor *x_grad = + ctx.Output(framework::GradVarName("X")); + phi::DenseTensor *filter_x_grad = ctx.Output(framework::GradVarName("FilterX")); - Tensor *scale_x_grad = + phi::DenseTensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); - Tensor *bias_x_grad = + phi::DenseTensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); int padding = ctx.Attr("padding"); @@ -276,7 +284,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad - Tensor conv_out_x_grad; + phi::DenseTensor conv_out_x_grad; conv_out_x_grad.Resize(conv_out_x->dims()); CudnnScaleBiasAddRelu sbar_x_op(dev_ctx, act_type, @@ -295,27 +303,28 @@ class ResNetUnitGradKernel : public framework::OpKernel { // ScaleBiasAddRelu // | // Y - const Tensor *z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); - const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); - const Tensor *saved_invstd_z = + const phi::DenseTensor *z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); + const phi::DenseTensor *saved_mean_z = + ctx.Input("SavedMeanZ"); + const phi::DenseTensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); - const Tensor *conv_out_z = ctx.Input("ConvZ"); + const phi::DenseTensor *conv_out_z = ctx.Input("ConvZ"); - Tensor *z_grad = + phi::DenseTensor *z_grad = ctx.Output(framework::GradVarName("Z")); - Tensor *filter_z_grad = + phi::DenseTensor *filter_z_grad = ctx.Output(framework::GradVarName("FilterZ")); - Tensor *scale_z_grad = + phi::DenseTensor *scale_z_grad = ctx.Output(framework::GradVarName("ScaleZ")); - Tensor *bias_z_grad = + phi::DenseTensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); // 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad and z_grad_temp - Tensor z_grad_temp; + phi::DenseTensor z_grad_temp; z_grad_temp.Resize(conv_out_z->dims()); sbar_x_op.Backward(dev_ctx, *y_grad, @@ -332,7 +341,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { eps); // 1.2 bn backward for z, get conv_out_z_grad, dscale_z, dbias_z - Tensor conv_out_z_grad; + phi::DenseTensor conv_out_z_grad; conv_out_z_grad.Resize(conv_out_z->dims()); CudnnScaleBiasAddRelu sbar_z_op( dev_ctx, "", false, false, output_shape, param_shape, bitmask_shape); @@ -366,7 +375,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { } else { // 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad (and z_grad) - Tensor *z_grad = + phi::DenseTensor *z_grad = fuse_add ? ctx.Output(framework::GradVarName("Z")) : nullptr; sbar_x_op.Backward(dev_ctx, diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc index 80986761c7cba..1e2741cde5d9e 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ResNetUnitXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; @@ -35,19 +33,22 @@ class ResNetUnitXPUKernel : public framework::OpKernel { bool is_nchw = (ctx.Attr("data_format") == "NCHW"); // input x - const Tensor *input_x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *bias_x = ctx.Input("BiasX"); + const phi::DenseTensor *input_x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *bias_x = ctx.Input("BiasX"); // output x - Tensor *conv_out_x = ctx.Output("ConvX"); - Tensor *saved_mean_x = ctx.Output("SavedMeanX"); - Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); - Tensor *running_mean_x = ctx.Output("RunningMeanX"); - Tensor *running_var_x = ctx.Output("RunningVarX"); + phi::DenseTensor *conv_out_x = ctx.Output("ConvX"); + phi::DenseTensor *saved_mean_x = ctx.Output("SavedMeanX"); + phi::DenseTensor *saved_invstd_x = + ctx.Output("SavedInvstdX"); + phi::DenseTensor *running_mean_x = + ctx.Output("RunningMeanX"); + phi::DenseTensor *running_var_x = + ctx.Output("RunningVarX"); - Tensor *output = ctx.Output("Y"); + phi::DenseTensor *output = ctx.Output("Y"); // attrs int padding = ctx.Attr("padding"); @@ -101,16 +102,20 @@ class ResNetUnitXPUKernel : public framework::OpKernel { std::vector w_maxlist = {nullptr}; if (has_shortcut) { // input z - const Tensor *input_z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *bias_z = ctx.Input("BiasZ"); - - Tensor *conv_out_z = ctx.Output("ConvZ"); - Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); - Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); - Tensor *running_mean_z = ctx.Output("RunningMeanZ"); - Tensor *running_var_z = ctx.Output("RunningVarZ"); + const phi::DenseTensor *input_z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *bias_z = ctx.Input("BiasZ"); + + phi::DenseTensor *conv_out_z = ctx.Output("ConvZ"); + phi::DenseTensor *saved_mean_z = + ctx.Output("SavedMeanZ"); + phi::DenseTensor *saved_invstd_z = + ctx.Output("SavedInvstdZ"); + phi::DenseTensor *running_mean_z = + ctx.Output("RunningMeanZ"); + phi::DenseTensor *running_var_z = + ctx.Output("RunningVarZ"); x_list.push_back(reinterpret_cast(input_z->data())); w_list.push_back(reinterpret_cast(filter_z->data())); @@ -137,7 +142,7 @@ class ResNetUnitXPUKernel : public framework::OpKernel { w_maxlist.push_back(nullptr); } else { if (fuse_add) { - const Tensor *input_z = ctx.Input("Z"); + const phi::DenseTensor *input_z = ctx.Input("Z"); auto input_z_shape = phi::vectorize(input_z->dims()); x_list.push_back(reinterpret_cast(input_z->data())); x_shape_list.push_back(input_z_shape); @@ -189,22 +194,25 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("It must use XPUPlace.")); bool is_nchw = (ctx.Attr("data_format") == "NCHW"); - const Tensor *y_grad = + const phi::DenseTensor *y_grad = ctx.Input(framework::GradVarName("Y")); - const Tensor *x = ctx.Input("X"); - const Tensor *filter_x = ctx.Input("FilterX"); - const Tensor *scale_x = ctx.Input("ScaleX"); - const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); - const 
Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); - const Tensor *conv_out_x = ctx.Input("ConvX"); - const Tensor *output = ctx.Input("Y"); - - Tensor *x_grad = ctx.Output(framework::GradVarName("X")); - Tensor *filter_x_grad = + const phi::DenseTensor *x = ctx.Input("X"); + const phi::DenseTensor *filter_x = ctx.Input("FilterX"); + const phi::DenseTensor *scale_x = ctx.Input("ScaleX"); + const phi::DenseTensor *saved_mean_x = + ctx.Input("SavedMeanX"); + const phi::DenseTensor *saved_invstd_x = + ctx.Input("SavedInvstdX"); + const phi::DenseTensor *conv_out_x = ctx.Input("ConvX"); + const phi::DenseTensor *output = ctx.Input("Y"); + + phi::DenseTensor *x_grad = + ctx.Output(framework::GradVarName("X")); + phi::DenseTensor *filter_x_grad = ctx.Output(framework::GradVarName("FilterX")); - Tensor *scale_x_grad = + phi::DenseTensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); - Tensor *bias_x_grad = + phi::DenseTensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); int padding = ctx.Attr("padding"); @@ -265,21 +273,22 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel { // ScaleBiasAddRelu // | // Y - const Tensor *z = ctx.Input("Z"); - const Tensor *filter_z = ctx.Input("FilterZ"); - const Tensor *scale_z = ctx.Input("ScaleZ"); - const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); - const Tensor *saved_invstd_z = + const phi::DenseTensor *z = ctx.Input("Z"); + const phi::DenseTensor *filter_z = ctx.Input("FilterZ"); + const phi::DenseTensor *scale_z = ctx.Input("ScaleZ"); + const phi::DenseTensor *saved_mean_z = + ctx.Input("SavedMeanZ"); + const phi::DenseTensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); - const Tensor *conv_out_z = ctx.Input("ConvZ"); + const phi::DenseTensor *conv_out_z = ctx.Input("ConvZ"); - Tensor *z_grad = + phi::DenseTensor *z_grad = ctx.Output(framework::GradVarName("Z")); - Tensor *filter_z_grad = + phi::DenseTensor *filter_z_grad = ctx.Output(framework::GradVarName("FilterZ")); - Tensor *scale_z_grad = + phi::DenseTensor *scale_z_grad = ctx.Output(framework::GradVarName("ScaleZ")); - Tensor *bias_z_grad = + phi::DenseTensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); x_list.push_back(reinterpret_cast(z->data())); w_list.push_back(reinterpret_cast(filter_z->data())); diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index 96646071567d5..f6fd97f918c07 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -29,7 +29,6 @@ template class SkipLayerNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - using Tensor = phi::DenseTensor; auto *X = context.Input("X"); auto *Y = context.Input("Y"); auto *scale = context.Input("Scale"); diff --git a/paddle/fluid/operators/fused/xpu_fused_common_function.h b/paddle/fluid/operators/fused/xpu_fused_common_function.h index 1a1ec8c47f9ba..63a22838e8c35 100644 --- a/paddle/fluid/operators/fused/xpu_fused_common_function.h +++ b/paddle/fluid/operators/fused/xpu_fused_common_function.h @@ -19,14 +19,13 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; struct XPUDropoutParam { float dropout_prob; bool is_upscale_in_train; bool is_test; bool fix_seed; - const Tensor *tensor_seed; + const phi::DenseTensor *tensor_seed; int seed_val; XPUDropoutParam() { @@ -61,8 +60,9 @@ struct XPUDropoutParam { str_seed = str_seed + "Seed"; } - tensor_seed = - 
context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + tensor_seed = context.HasInput(str_seed) + ? context.Input(str_seed) + : nullptr; if (tensor_seed) { seed_val = *(tensor_seed->data()); } else { @@ -74,7 +74,7 @@ struct XPUDropoutParam { bool is_upscale_in_train_, bool is_test_, bool fix_seed_, - const Tensor *tensor_seed, + const phi::DenseTensor *tensor_seed, int seed_val_) { dropout_prob = dropout_prob_; is_upscale_in_train = is_upscale_in_train_; @@ -108,8 +108,9 @@ struct XPUDropoutParam { } else { str_seed = str_seed + "Seed"; } - tensor_seed = - context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + tensor_seed = context.HasInput(str_seed) + ? context.Input(str_seed) + : nullptr; if (tensor_seed) { seed_val = *(tensor_seed->data()); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu index 696cab20db714..88d589f85b0ec 100644 --- a/paddle/fluid/operators/fused/yolo_box_head_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -67,7 +67,6 @@ template class YoloBoxHeadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using Tensor = phi::DenseTensor; auto* x = context.Input("X"); auto* out = context.Output("Out"); auto anchors = context.Attr>("anchors"); diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index 072f0374c5b82..fc01d7027f31d 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -319,7 +319,6 @@ template class YoloBoxPostKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using Tensor = phi::DenseTensor; // prepare inputs std::vector boxes_input(3); std::vector> boxes_input_dims(3); diff --git a/paddle/fluid/operators/gather_nd_op_mlu.cc b/paddle/fluid/operators/gather_nd_op_mlu.cc index b6c96e3c2edd5..93b20c86af860 100644 --- a/paddle/fluid/operators/gather_nd_op_mlu.cc +++ b/paddle/fluid/operators/gather_nd_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GatherNdMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 5cea840b4aec5..6629d369db0c6 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template diff --git a/paddle/fluid/operators/gather_scatter_kernel.cc b/paddle/fluid/operators/gather_scatter_kernel.cc index b579b3175d396..1c6b2e6c1a095 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cc +++ b/paddle/fluid/operators/gather_scatter_kernel.cc @@ -16,8 +16,6 @@ limitations under the License. 
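The XPUDropoutParam hunks keep the conditional seed lookup and only spell out the tensor type. A small sketch of the intended flow, assuming the usual ExecutionContext members, an int seed tensor, and `seed_val` being the struct member it feeds:

    // Prefer a seed fed as a tensor input; leave the attribute fallback otherwise.
    const phi::DenseTensor *tensor_seed =
        context.HasInput(str_seed)
            ? context.Input<phi::DenseTensor>(str_seed)
            : nullptr;
    if (tensor_seed != nullptr) {
      seed_val = *(tensor_seed->data<int>());
    }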
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class TensorAssign { public: template @@ -50,7 +48,7 @@ template struct cpu_gather_scatter_functor { template - void operator()(Tensor self, + void operator()(phi::DenseTensor self, int dim, const phi::DenseTensor& index, const phi::DenseTensor& src, @@ -130,10 +128,10 @@ struct cpu_gather_scatter_functor { }; template -void cpu_gather_kernel(Tensor self, +void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_assign_kernel(Tensor self, +void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_add_kernel(Tensor self, +void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_mul_kernel(Tensor self, +void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { cpu_gather_scatter_functor -void cpu_scatter_input_grad_kernel(Tensor self, +void cpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor output, + phi::DenseTensor output, const platform::DeviceContext& ctx) { auto* index_data = index.data(); auto* output_data = output.data(); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index 2f17b946c6149..1cb4e4a4e9d78 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -18,8 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class TensorAssign { public: template @@ -107,10 +105,10 @@ template struct gpu_gather_scatter_functor { template - void operator()(Tensor self, + void operator()(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const std::string& method_name, const func_t& reduce_op, const platform::DeviceContext& ctx) { @@ -160,10 +158,10 @@ struct gpu_gather_scatter_functor { }; // struct gpu_gather_scatter_functor template -void gpu_gather_kernel(Tensor self, +void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_assign_kernel(Tensor self, +void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_add_kernel(Tensor self, +void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_mul_kernel(Tensor self, +void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx) { gpu_gather_scatter_functor -void gpu_scatter_input_grad_kernel(Tensor self, +void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor grad, + phi::DenseTensor grad, const platform::DeviceContext& ctx) { auto* index_data = index.data(); auto* grad_data = grad.data(); diff --git a/paddle/fluid/operators/gather_scatter_kernel.h b/paddle/fluid/operators/gather_scatter_kernel.h index b97451b488b92..9cf3c3e33009a 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.h +++ b/paddle/fluid/operators/gather_scatter_kernel.h @@ -30,87 +30,85 @@ namespace operators { Instantiate_Template_Function_index_t(func, unsigned char) #define Instantiate_Template_Function_index_t(func, tensor_t) \ - template void func(Tensor input, \ + template void func(phi::DenseTensor input, \ int dim, \ const phi::DenseTensor& index, \ - Tensor result, \ + phi::DenseTensor result, \ const platform::DeviceContext& ctx); \ - template void func(Tensor input, \ + template void func(phi::DenseTensor input, \ int dim, \ const phi::DenseTensor& index, \ - Tensor result, \ + phi::DenseTensor result, \ const platform::DeviceContext& ctx); -using Tensor = phi::DenseTensor; - template -void cpu_gather_kernel(Tensor self, +void cpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); template -void cpu_scatter_assign_kernel(Tensor self, +void cpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void cpu_scatter_add_kernel(Tensor self, +void cpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void cpu_scatter_mul_kernel(Tensor self, +void cpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template 
-void cpu_scatter_input_grad_kernel(Tensor self, +void cpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); template -void gpu_gather_kernel(Tensor self, +void gpu_gather_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); template -void gpu_scatter_assign_kernel(Tensor self, +void gpu_scatter_assign_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void gpu_scatter_add_kernel(Tensor self, +void gpu_scatter_add_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void gpu_scatter_mul_kernel(Tensor self, +void gpu_scatter_mul_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor src, + phi::DenseTensor src, const platform::DeviceContext& ctx); template -void gpu_scatter_input_grad_kernel(Tensor self, +void gpu_scatter_input_grad_kernel(phi::DenseTensor self, int dim, const phi::DenseTensor& index, - Tensor result, + phi::DenseTensor result, const platform::DeviceContext& ctx); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index ee095c598bc1b..0f81d7fec3184 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/gaussian_random_op_mlu.cc b/paddle/fluid/operators/gaussian_random_op_mlu.cc index a70ddc428d840..5128cc9502581 100644 --- a/paddle/fluid/operators/gaussian_random_op_mlu.cc +++ b/paddle/fluid/operators/gaussian_random_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class MLUGaussianRandomKernel : public framework::OpKernel { public: @@ -30,7 +29,7 @@ class MLUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - Tensor cpu_tensor(tensor->type()); + phi::DenseTensor cpu_tensor(tensor->type()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); std::normal_distribution dist(mean, std); diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc index 3523eb7379399..5e3fa3dbef5e6 100644 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/gaussian_random_op_npu.cc @@ -22,7 +22,6 @@ limitations under the License. 
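After the header edit every gather/scatter entry point names phi::DenseTensor directly in its signature. One representative declaration, with the two template parameters assumed from the Instantiate_Template_Function macros rather than quoted verbatim:

    template <typename tensor_t, typename index_t>
    void cpu_gather_kernel(phi::DenseTensor self,
                           int dim,
                           const phi::DenseTensor &index,
                           phi::DenseTensor result,
                           const platform::DeviceContext &ctx);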
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class NPUGaussianRandomKernel : public framework::OpKernel { public: @@ -32,7 +31,7 @@ class NPUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); std::normal_distribution dist(mean, std); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index f462336b412a3..7f6d5be9d0c73 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GeluNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 2e703282bf932..39767b5e20a87 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -49,8 +49,6 @@ constexpr int WARP_SIZE = 32; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct MaxFunctor { T cap; diff --git a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h index 278bbd5efd723..f5ec87f23c88b 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.h +++ b/paddle/fluid/operators/graph_khop_sampler_op.h @@ -28,8 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void SampleUniqueNeighbors(bidiiter begin, bidiiter end, int num_samples) { int left_num = std::distance(begin, end); diff --git a/paddle/fluid/operators/grid_sampler_op_mlu.cc b/paddle/fluid/operators/grid_sampler_op_mlu.cc index f71969d8b551c..07aa025a9a26c 100644 --- a/paddle/fluid/operators/grid_sampler_op_mlu.cc +++ b/paddle/fluid/operators/grid_sampler_op_mlu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class GridSamplerMLUKernel : public framework::OpKernel { public: @@ -60,13 +58,13 @@ class GridSamplerMLUKernel : public framework::OpKernel { platform::errors::Unavailable( "Only support zeros padding_mode in mlu grid_sample kernel.")); - Tensor trans_input(input->dtype()); + phi::DenseTensor trans_input(input->dtype()); // transpose input from NCHW to NHWC const std::vector perm_to_nhwc = {0, 2, 3, 1}; TransposeFromMLUTensor( ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/); - Tensor tmp_output(output->dtype()); + phi::DenseTensor tmp_output(output->dtype()); tmp_output.mutable_data({n, out_h, out_w, c}, ctx.GetPlace()); MLUCnnlGridSampleDesc grid_sample_desc(mode, padding_mode, align_corners); diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 3d6566d62b2a7..7331c792ea568 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -28,7 +28,6 @@ limitations under the License. 
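The gaussian_random NPU/MLU hunks keep the host-staging pattern with the spelled-out type. A minimal sketch, with the <T> template arguments assumed and `tensor`, `mean`, and `std` taken from the surrounding kernel:

    // Draw the samples on the host, then copy the buffer to the device tensor.
    phi::DenseTensor cpu_tensor(tensor->dtype());
    cpu_tensor.Resize(tensor->dims());
    T *cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
    std::normal_distribution<T> dist(mean, std);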
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; class GroupNormOp : public framework::OperatorWithKernel { @@ -123,16 +122,16 @@ class GroupNormGradOp : public framework::OperatorWithKernel { var, platform::errors::InvalidArgument( "Input(Y@GRAD) of GroupNormGradOp should not be null")); - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } - PADDLE_ENFORCE_NOT_NULL( - t, - platform::errors::InvalidArgument( - "Input(Y@GRAD) Tensor of GroupNormGradOp should not be null")); + PADDLE_ENFORCE_NOT_NULL(t, + platform::errors::InvalidArgument( + "Input(Y@GRAD) phi::DenseTensor of " + "GroupNormGradOp should not be null")); return framework::OpKernelType(framework::TransToProtoVarType(t->dtype()), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 6b2ba1670a3b7..9cb4e54ac0054 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -291,7 +291,7 @@ class GroupNormKernel : public framework::OpKernel { var->mutable_data(ctx.GetPlace()); phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; + phi::DenseTensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); auto* x_data = x->data(); auto* y_data = y->data(); @@ -642,7 +642,7 @@ class GroupNormGradKernel : public framework::OpKernel { phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor ds, db; + phi::DenseTensor ds, db; ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); db.mutable_data({x_dims[0], C}, ctx.GetPlace()); T* ds_data = ds.data(); @@ -728,7 +728,7 @@ class GroupNormGradKernel : public framework::OpKernel { // p1 = scale * var_inv // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); - Tensor p1, p2, p3; + phi::DenseTensor p1, p2, p3; p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); @@ -770,12 +770,12 @@ class GroupNormGradKernel : public framework::OpKernel { set_zero(dev_ctx, d_bias, static_cast(0)); } - Tensor temp_var; + phi::DenseTensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); set_zero(dev_ctx, &temp_var, static_cast(0)); T* temp_var_data = temp_var.data(); - Tensor temp_mean; + phi::DenseTensor temp_mean; temp_mean.mutable_data(var->dims(), ctx.GetPlace()); set_zero(dev_ctx, &temp_mean, static_cast(0)); T* temp_mean_data = temp_mean.data(); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 0ce89b4625a13..95cdeefc783f4 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -28,7 +28,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index 5fded4cffc713..2c0dec9dd4d0b 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template struct GroupNormFunction { public: @@ -103,14 +101,14 @@ struct GroupNormFunction { const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); runner.Run(stream); } - Tensor ReduceMeanToNG(const phi::DenseTensor* x, - const DataLayout& data_layout, - const int64_t N, - const int64_t C, - const int64_t H, - const int64_t W, - const int G) { - Tensor y(x->type()); + phi::DenseTensor ReduceMeanToNG(const phi::DenseTensor* x, + const DataLayout& data_layout, + const int64_t N, + const int64_t C, + const int64_t H, + const int64_t W, + const int G) { + phi::DenseTensor y(x->type()); // y.mutable_data( {N,G,1}, place ); if (data_layout == DataLayout::kNCHW) { y.mutable_data({N, G, 1}, place); @@ -119,7 +117,7 @@ struct GroupNormFunction { } else { y.mutable_data({N, 1, G}, place); // shape of x is [N, C*H*W/G, G] - Tensor x_trans(x->type()); + phi::DenseTensor x_trans(x->type()); x_trans.mutable_data({N, G, C * H * W / G}, place); this->Transpose(x, &x_trans, std::vector{0, 2, 1}); this->ReduceMean(&x_trans, &y, std::vector{2}); @@ -150,7 +148,7 @@ class GroupNormNPUKernel : public framework::OpKernel { const auto groups = ctx.Attr("groups"); auto place = ctx.GetPlace(); - Tensor xnorm(x->type()); + phi::DenseTensor xnorm(x->type()); xnorm.mutable_data(x->dims(), place); GroupNormFunction F(ctx); if (data_layout != DataLayout::kNCHW) { @@ -173,12 +171,12 @@ class GroupNormNPUKernel : public framework::OpKernel { F.ReduceMean(&xnorm, mean, axis); F.Sub(&xnorm, mean, &xnorm); - Tensor sqr(x->type()); + phi::DenseTensor sqr(x->type()); sqr.mutable_data(xnorm.dims(), place); F.Mul(&xnorm, &xnorm, &sqr); F.ReduceMean(&sqr, var, axis); - Tensor std(x->type()); + phi::DenseTensor std(x->type()); std.mutable_data(var->dims(), place); F.Adds(var, epsilon, &std); F.Sqrt(&std, &std); @@ -186,13 +184,13 @@ class GroupNormNPUKernel : public framework::OpKernel { F.Div(&xnorm, &std, y); y->Resize({N, C, H, W}); if (scale) { - Tensor scale_t(scale->type()); + phi::DenseTensor scale_t(scale->type()); scale_t.ShareDataWith(*scale); scale_t.Resize({C, 1, 1}); F.Mul(y, &scale_t, y); } if (bias) { - Tensor bias_t(bias->type()); + phi::DenseTensor bias_t(bias->type()); bias_t.ShareDataWith(*bias); bias_t.Resize({C, 1, 1}); F.Add(y, &bias_t, y); @@ -231,11 +229,11 @@ class GroupNormGradNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto _type = y->type(); - Tensor xnorm(_type); + phi::DenseTensor xnorm(_type); xnorm.mutable_data(y->dims(), place); - Tensor scale_share(_type); + phi::DenseTensor scale_share(_type); scale_share.ShareDataWith(*scale); - Tensor bias_share(_type); + phi::DenseTensor bias_share(_type); bias_share.ShareDataWith(*bias); int64_t N = y->dims()[0]; @@ -267,7 +265,7 @@ class GroupNormGradNPUKernel : public framework::OpKernel { } if (d_scale) { d_scale->mutable_data(place); - Tensor dy_xnorm(_type); + phi::DenseTensor dy_xnorm(_type); dy_xnorm.mutable_data(d_y->dims(), place); F.Mul(d_y, &xnorm, &dy_xnorm); if (data_layout == DataLayout::kNCHW) { @@ -278,12 +276,12 @@ class GroupNormGradNPUKernel : public framework::OpKernel { } // std = Sqrt(var+epsilon), init shape = [ N, G ] - Tensor std(_type); + phi::DenseTensor std(_type); std.mutable_data(var->dims(), place); F.Adds(var, epsilon, &std); F.Sqrt(&std, &std); // d_xnorm_std = dy_proc * scale / std - Tensor d_xnorm_std(_type); + phi::DenseTensor d_xnorm_std(_type); d_xnorm_std.mutable_data(y->dims(), place); 
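The GroupNorm NPU hunks keep the zero-copy broadcast trick for scale and bias; a short sketch of that step, with `F`, `y`, `scale`, and `C` coming from the surrounding kernel:

    if (scale) {
      // View the 1-D scale as [C, 1, 1] so it broadcasts over [N, C, H, W]
      // without copying the underlying buffer.
      phi::DenseTensor scale_t(scale->type());
      scale_t.ShareDataWith(*scale);
      scale_t.Resize({C, 1, 1});
      F.Mul(y, &scale_t, y);
    }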
F.Mul(d_y, &scale_share, &d_xnorm_std); if (data_layout == DataLayout::kNCHW) { @@ -303,10 +301,11 @@ class GroupNormGradNPUKernel : public framework::OpKernel { d_x->mutable_data(place); d_x->Resize(xnorm.dims()); F.Mul(&d_xnorm_std, &xnorm, d_x); - Tensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); + phi::DenseTensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); F.Mul(&dx1, &xnorm, d_x); - Tensor dx2 = F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); + phi::DenseTensor dx2 = + F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); F.Sub(&d_xnorm_std, d_x, d_x); F.Sub(d_x, &dx2, d_x); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index cceecdcad5fd2..1c10692d15fad 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -370,7 +370,7 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - Tensor ordered_h0; + phi::DenseTensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -440,10 +440,10 @@ class GRUCPUKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); @@ -505,10 +505,10 @@ class GRUCPUKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index a6b57bd88f77d..53006c55f6b98 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -73,7 +73,7 @@ class GRUKernel : public framework::OpKernel { gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - Tensor ordered_h0; + phi::DenseTensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -102,9 +102,10 @@ class GRUKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); 
gru_value.reset_output_value = reset_hidden_prev_t.data(); diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 89731e2efa022..286bf9fe2732d 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -25,8 +25,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template inline void ReorderInitState(const DeviceContext& ctx, const phi::DenseTensor& src, @@ -79,7 +77,7 @@ class GRUGradKernel : public framework::OpKernel { zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); - Tensor ordered_h0, ordered_h0_grad; + phi::DenseTensor ordered_h0, ordered_h0_grad; framework::Vector order(batch_gate->lod()[2]); @@ -126,16 +124,17 @@ class GRUGradKernel : public framework::OpKernel { int bend = static_cast(batch_starts[n + 1]); int cur_batch_size = bend - bstart; - Tensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); gru_value.gate_value = gate_t.data(); - Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); + phi::DenseTensor reset_hidden_prev_t = + batch_reset_hidden_prev->Slice(bstart, bend); gru_value.reset_output_value = reset_hidden_prev_t.data(); - Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); + phi::DenseTensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); gru_grad.output_grad = hidden_grad_t.data(); - Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); + phi::DenseTensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); gru_grad.gate_grad = gate_grad_t.data(); - Tensor reset_hidden_prev_grad_t = + phi::DenseTensor reset_hidden_prev_grad_t = batch_reset_hidden_prev_grad.Slice(bstart, bend); gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); if (n == 0) { @@ -144,9 +143,11 @@ class GRUGradKernel : public framework::OpKernel { h0 && h0_grad ? ordered_h0_grad.data() : nullptr; } else { int bstart_pre = static_cast(batch_starts[n - 1]); - Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); + phi::DenseTensor hidden_prev_t = + batch_hidden->Slice(bstart_pre, bstart); gru_value.prev_out_value = hidden_prev_t.data(); - Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); + phi::DenseTensor hidden_prev_grad_t = + batch_hidden_grad.Slice(bstart_pre, bstart); gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } gru_value.output_value = nullptr; diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 3ed3179a63e63..d46e6cf429f6f 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -23,8 +23,6 @@ limitations under the License. 
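The GRU hunks repeatedly slice the batched tensors per time step; a compact sketch of one step, with the <T> on data() assumed and the loop bounds and gru_value taken from the surrounding code:

    // Work on the rows belonging to the current time step only.
    phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend);
    phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend);
    gru_value.gate_value = gate_t.data<T>();
    gru_value.output_value = hidden_t.data<T>();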
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; template @@ -192,8 +190,8 @@ class GRUUnitGradKernel : public framework::OpKernel { context.Output(framework::GradVarName("Weight")); auto* bias_grad = context.Output(framework::GradVarName("Bias")); - Tensor gate_grad; - Tensor reset_hidden_prev_grad; + phi::DenseTensor gate_grad; + phi::DenseTensor reset_hidden_prev_grad; const T* hidden_prev_data = hidden_prev->data(); const T* weight_data = weight->data(); diff --git a/paddle/fluid/operators/huber_loss_op_mlu.cc b/paddle/fluid/operators/huber_loss_op_mlu.cc index 4387037ad01af..4dc542b675f54 100644 --- a/paddle/fluid/operators/huber_loss_op_mlu.cc +++ b/paddle/fluid/operators/huber_loss_op_mlu.cc @@ -18,17 +18,15 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class HuberLossMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = GetDevCtxFromCTX(ctx); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* residual = ctx.Output("Residual"); - auto* out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* residual = ctx.Output("Residual"); + auto* out = ctx.Output("Out"); auto delta = ctx.Attr("delta"); auto place = ctx.GetPlace(); @@ -65,7 +63,7 @@ class HuberLossMLUKernel : public framework::OpKernel { GetBasePtr(out)); // compute multiply by delta - Tensor scale_tensor, bias_tensor; + phi::DenseTensor scale_tensor, bias_tensor; scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); FillMLUTensorWithHostValue(ctx, static_cast(delta), &scale_tensor); @@ -93,20 +91,20 @@ class HuberLossGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto& dev_ctx = GetDevCtxFromCTX(ctx); - auto* residual = ctx.Input("Residual"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* residual = ctx.Input("Residual"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); auto delta = ctx.Attr("delta"); auto place = ctx.GetPlace(); - Tensor t_grad_rd; + phi::DenseTensor t_grad_rd; t_grad_rd = ctx.AllocateTmpTensor(residual->dims(), dev_ctx); MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd); if (dx || dy) { - Tensor t_zero; + phi::DenseTensor t_zero; t_zero = ctx.AllocateTmpTensor(residual->dims(), dev_ctx); FillMLUTensorWithHostValue(ctx, static_cast(0.f), &t_zero); @@ -130,7 +128,7 @@ class HuberLossGradMLUKernel : public framework::OpKernel { GetBasePtr(&t_grad_rd)); } // compute multiply by delta - Tensor scale_tensor, bias_tensor; + phi::DenseTensor scale_tensor, bias_tensor; scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index a7be6feb628bf..78529df55aa94 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. 
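The Huber-loss MLU hunks allocate their scalar helpers through the context; a sketch of that call with the template arguments written out as assumptions (element type T and the MLU device context):

    // One-element tensor holding delta, used to scale the residual.
    phi::DenseTensor scale_tensor =
        ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
    FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);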
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void HuberLossSub(const platform::Place& place, const aclrtStream& stream, @@ -117,9 +115,9 @@ class HuberLossGradNPUKernel : public framework::OpKernel { .stream(); auto place = ctx.GetPlace(); - Tensor t_grad_rd; + phi::DenseTensor t_grad_rd; if (dx || dy) { - Tensor t_zero; + phi::DenseTensor t_zero; HuberLossZerosLike(place, stream, residual, &t_zero); HuberLossSmoothL1LossGrad( place, stream, residual, &t_zero, dout, delta, &t_grad_rd); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index a9da8f8f4dbbc..523639faddcbe 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -26,8 +26,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - inline int Im2SeqOutputSize( int input_size, int filter_size, int padding_0, int padding_1, int stride) { const int output_size = @@ -52,7 +50,7 @@ class Im2SequenceKernel : public framework::OpKernel { if (ctx.HasInput("Y") && batch_size > 1) { const phi::DenseTensor* imgrealsize = ctx.Input("Y"); auto out_stride = ctx.Attr>("out_stride"); - Tensor cpu_shape_tensor; + phi::DenseTensor cpu_shape_tensor; paddle::framework::TensorCopySync( *imgrealsize, platform::CPUPlace(), &cpu_shape_tensor); std::vector imgreal_h; @@ -89,15 +87,16 @@ class Im2SequenceKernel : public framework::OpKernel { const std::vector dilations({1, 1}); int offset_out = 0; for (int i = 0; i < batch_size; i++) { - const Tensor src = + const phi::DenseTensor src = in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(offset_out, - offset_out + output_height[i] * output_width[i]) - .Resize({output_height[i], - output_width[i], - img_channels, - kernels[0], - kernels[1]}); + phi::DenseTensor dst = + out->Slice(offset_out, + offset_out + output_height[i] * output_width[i]) + .Resize({output_height[i], + output_width[i], + img_channels, + kernels[0], + kernels[1]}); offset_out += output_height[i] * output_width[i]; phi::funcs::Im2ColFunctor @@ -127,13 +126,13 @@ class Im2SequenceKernel : public framework::OpKernel { auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { - const Tensor src = + const phi::DenseTensor src = in->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - Tensor dst = out->Slice(i, i + 1).Resize({output_height, - output_width, - img_channels, - kernels[0], - kernels[1]}); + phi::DenseTensor dst = out->Slice(i, i + 1).Resize({output_height, + output_width, + img_channels, + kernels[0], + kernels[1]}); phi::funcs::Im2ColFunctor f; @@ -187,9 +186,9 @@ class Im2SequenceGradKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); d_out->Resize({batch_size, d_out->numel() / batch_size}); for (int i = 0; i < batch_size; i++) { - Tensor dst = + phi::DenseTensor dst = d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width}); - const Tensor src = d_out->Slice(i, i + 1).Resize( + const phi::DenseTensor src = d_out->Slice(i, i + 1).Resize( {output_height, output_width, img_channels, kernels[0], kernels[1]}); phi::funcs::Col2ImFunctor f; diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index 425590ebeeb52..0e7f1fea1bd81 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, @@ -38,7 +37,7 @@ void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, gather_index_vec.push_back(index_vec[i * index_length + j]); } } - Tensor gather_index; + phi::DenseTensor gather_index; framework::TensorFromVector(gather_index_vec, dev_ctx, &gather_index); gather_index.Resize({batch_size, index_length, 2}); @@ -89,7 +88,7 @@ void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, scatter_index_vec.push_back(index_vec[i * index_length + j]); } } - Tensor scatter_index; + phi::DenseTensor scatter_index; framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); scatter_index.Resize({batch_size, index_length, 2}); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index a705a95156608..6bb91f325f953 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -22,7 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index 0f18f9793d305..327471b216f0b 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class IndexSelectNPUKernel : public framework::OpKernel { public: @@ -66,7 +64,7 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { dim += out_dims.size(); } - Tensor casted_index; + phi::DenseTensor casted_index; if (framework::TransToProtoVarType(index->dtype()) != framework::proto::VarType::INT32) { casted_index.mutable_data(index->dims(), ctx.GetPlace()); @@ -90,7 +88,7 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { .AddOutput(*x_grad); runner.Run(stream); } else { - Tensor transed_out_grad; + phi::DenseTensor transed_out_grad; std::vector in_trans_perm; in_trans_perm.push_back(dim); for (int i = 0; i < out_dims.size(); ++i) { @@ -109,7 +107,7 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { .AddOutput(transed_out_grad); in_trans_runner.Run(stream); - Tensor sum_out; + phi::DenseTensor sum_out; framework::DDim sum_dims(x_dims); sum_dims[0] = x_dims[dim]; auto idx = 1; diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 53453c6cad184..a80324d5d303a 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -145,8 +145,8 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { "can't find gradient variable of Y")); } const phi::DenseTensor* t = nullptr; - if (var->IsType()) { - t = &var->Get(); + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } @@ -323,9 +323,9 @@ class InplaceABNGradKernel : public framework::OpKernel { auto* mean = ctx.Input("ReserveSpace"); auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt; - paddle::optional mean_opt; - paddle::optional variance_opt; + paddle::optional space_opt; + paddle::optional mean_opt; + paddle::optional variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/inplace_abn_op.cu 
b/paddle/fluid/operators/inplace_abn_op.cu index e1131822f289e..bec88e5dfd2a7 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -171,9 +171,9 @@ class InplaceABNGradKernel : public framework::OpKernel { scale_grad, bias_grad); } else { - paddle::optional space_opt; - paddle::optional mean_opt; - paddle::optional variance_opt; + paddle::optional space_opt; + paddle::optional mean_opt; + paddle::optional variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 2a9568e845492..29253662d4deb 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -22,7 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index ed474193461c3..c9f33799c9e10 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -105,9 +105,9 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::NotFound("cannot find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } @@ -126,9 +126,9 @@ framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( PADDLE_THROW( platform::errors::NotFound("cannot find gradient variable of Y")); } - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 2101f6a12bb53..05e2bde973924 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; class InstanceNormOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc index f46c3a806a2c0..f11719bea9c7c 100644 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ b/paddle/fluid/operators/instance_norm_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. 
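In the inplace_abn hunks the reserve-space and statistics wrappers become optionals of the concrete type; a short sketch of the pattern as it appears around these lines, with `reserve_space` taken from the surrounding kernel:

    // Optional inputs forwarded to the phi batch-norm grad functor.
    paddle::optional<phi::DenseTensor> space_opt;
    paddle::optional<phi::DenseTensor> mean_opt;
    paddle::optional<phi::DenseTensor> variance_opt;
    if (reserve_space != nullptr) {
      space_opt = *reserve_space;
    }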
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class InstanceNormNPUKernel : public framework::OpKernel { @@ -56,7 +55,7 @@ class InstanceNormNPUKernel : public framework::OpKernel { } } - Tensor tmp_x, tmp_y; + phi::DenseTensor tmp_x, tmp_y; tmp_x.ShareDataWith(*x); tmp_x.Resize(phi::make_ddim(tmp_x_dims)); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index a589b49500e0a..a0e1410f52d3d 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -945,7 +945,7 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_w = size_data[0]; @@ -1040,7 +1040,7 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_h = size_data[0]; @@ -1195,7 +1195,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_d = size_data[0]; @@ -1288,7 +1288,7 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, template static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, phi::DenseTensor* input_grad, - const Tensor output_grad) { + const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); @@ -1314,7 +1314,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_w = size_data[0]; @@ -1379,7 +1379,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, template static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, phi::DenseTensor* input_grad, - const Tensor output_grad) { + const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); @@ -1407,7 +1407,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_h = size_data[0]; @@ -1555,7 +1555,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - Tensor sizes; + phi::DenseTensor sizes; framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_d = size_data[0]; diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 11c1429107654..ad67efc4b78d5 100644 --- 
a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -26,7 +26,6 @@ template using EigenTensor = phi::EigenTensor; -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; inline std::vector get_new_shape( @@ -1344,7 +1343,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, template static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, phi::DenseTensor* input_grad, - const Tensor output_grad) { + const phi::DenseTensor output_grad) { auto* input = ctx.Input("X"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc index a059d5b522ee3..36ffd1ae53ed6 100644 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ b/paddle/fluid/operators/interpolate_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; inline static void CheckArgument(const framework::ExecutionContext& ctx) { @@ -136,7 +135,7 @@ class InterpolateNPUKernel : public framework::OpKernel { CalcOutSize(ctx, h, w, &out_h, &out_w); // the 'input' tensor may has no set (or wrong set) of the layout - Tensor input_x(input->type()); + phi::DenseTensor input_x(input->type()); input_x.ShareDataWith(*input); input_x.set_layout(data_layout); @@ -188,7 +187,7 @@ class InterpolateGradNPUKernel : public framework::OpKernel { // the 'output_grad' tensor may has no set (or wrong set) of the layout auto* output_grad = ctx.Input(framework::GradVarName("Out")); - Tensor output_grad_tmp(output_grad->type()); + phi::DenseTensor output_grad_tmp(output_grad->type()); output_grad_tmp.ShareDataWith(*output_grad); output_grad_tmp.set_layout(data_layout); diff --git a/paddle/fluid/operators/interpolate_v2_op_mlu.cc b/paddle/fluid/operators/interpolate_v2_op_mlu.cc index 833d650d6a131..e6f34539b1c01 100644 --- a/paddle/fluid/operators/interpolate_v2_op_mlu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_mlu.cc @@ -175,7 +175,7 @@ class InterpolateV2MLUKernel : public framework::OpKernel { // cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and // CNNL_INTERP_NEAREST, framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans; - Tensor transformed_input, transformed_output; + phi::DenseTensor transformed_input, transformed_output; bool need_transpose = input_dims.size() != 2; if (input_dims.size() == 4) { // need to do transpose if layout is kNCHW @@ -439,7 +439,7 @@ class InterpolateV2GradMLUKernel : public framework::OpKernel { framework::DDim dim_grad; framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad, dim_in_trans_grad; - Tensor transformed_output_grad, transformed_input_grad; + phi::DenseTensor transformed_output_grad, transformed_input_grad; bool need_transpose = input_dims.size() != 2 && data_layout == DataLayout::kNCHW; diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index 69d2f563e37bc..31f08badd128c 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; using DDim = framework::DDim; using fp16 = paddle::platform::float16; @@ -104,7 +103,7 @@ struct InterpolateFunction { auto yt = y_new_shape[axis]; y_new_shape[axis] = y_new_shape[0]; y_new_shape[0] = yt; - Tensor gy_t; + phi::DenseTensor gy_t; gy_t.mutable_data(y_new_shape, place); Transpose(gy, &gy_t, axis_swap); // 2 scatter @@ -112,7 +111,7 @@ struct InterpolateFunction { auto xt = x_new_shape[axis]; x_new_shape[axis] = x_new_shape[0]; x_new_shape[0] = xt; - Tensor gx_zero, gx_t; + phi::DenseTensor gx_zero, gx_t; gx_zero.mutable_data(x_new_shape, place); gx_t.mutable_data(x_new_shape, place); FillNpuTensorWithConstant(&gx_zero, static_cast(0)); @@ -161,14 +160,14 @@ struct InterpolateFunction { platform::Place place; aclrtStream stream; const framework::ExecutionContext& ctx; - Tensor t0; - Tensor t1; - Tensor tn; + phi::DenseTensor t0; + phi::DenseTensor t1; + phi::DenseTensor tn; }; template <> void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { - Tensor x_fp32(experimental::DataType::FLOAT32); + phi::DenseTensor x_fp32(experimental::DataType::FLOAT32); x_fp32.mutable_data(x->dims(), place); FillNpuTensorWithConstant(&tn, static_cast(n)); const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); @@ -238,7 +237,7 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, phi::DenseTensor* coef_w1) { InterpolateFunction F(ctx); auto place = ctx.GetPlace(); - Tensor _h0, _w0; + phi::DenseTensor _h0, _w0; _h0.mutable_data({out_h}, place); _w0.mutable_data({out_w}, place); F.Arange(out_h, &_h0); @@ -255,8 +254,8 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, F.Muls(&_w0, ratio_w, &_w0); } - Tensor zero_t; - Tensor one_t; + phi::DenseTensor zero_t; + phi::DenseTensor one_t; zero_t.mutable_data({1}, place); one_t.mutable_data({1}, place); FillNpuTensorWithConstant(&zero_t, static_cast(0)); @@ -264,7 +263,7 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, F.Maximum(&_h0, &zero_t, &_h0); F.Maximum(&_w0, &zero_t, &_w0); - Tensor _h0_floor, _w0_floor; + phi::DenseTensor _h0_floor, _w0_floor; _h0_floor.mutable_data({out_h}, place); _w0_floor.mutable_data({out_w}, place); F.Floor(&_h0, &_h0_floor); @@ -272,12 +271,12 @@ void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, F.Cast(&_h0_floor, h0); F.Cast(&_w0_floor, w0); - Tensor one_int; + phi::DenseTensor one_int; one_int.mutable_data({1}, place); FillNpuTensorWithConstant(&one_int, static_cast(1)); F.Add(h0, &one_int, h1); F.Add(w0, &one_int, w1); - Tensor t_max_h, t_max_w; + phi::DenseTensor t_max_h, t_max_w; t_max_h.mutable_data({1}, place); t_max_w.mutable_data({1}, place); FillNpuTensorWithConstant(&t_max_h, static_cast(in_h - 1)); @@ -334,12 +333,12 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, &ratio_h, &ratio_w); - Tensor h0, h1, w0, w1; + phi::DenseTensor h0, h1, w0, w1; h0.mutable_data({out_h}, place); h1.mutable_data({out_h}, place); w0.mutable_data({out_w}, place); w1.mutable_data({out_w}, place); - Tensor coef_h0, coef_h1, coef_w0, coef_w1; + phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; coef_h0.mutable_data({out_h}, place); coef_h1.mutable_data({out_h}, place); coef_w0.mutable_data({out_w}, place); @@ -363,7 +362,7 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, &coef_w0, &coef_w1); - Tensor input_gather_h0, input_gather_h1; + phi::DenseTensor input_gather_h0, 
input_gather_h1; auto dim_gather_h = indim; dim_gather_h[axis_h] = out_h; input_gather_h0.mutable_data(dim_gather_h, place); @@ -374,13 +373,13 @@ void BilinearFwdNpu(const framework::ExecutionContext& ctx, F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0); F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1); - Tensor out_x4; + phi::DenseTensor out_x4; out_x4.mutable_data({4, outdim[0], outdim[1], outdim[2], outdim[3]}, place); - Tensor input_gather_h0_w0 = out_x4.Slice(0, 1); - Tensor input_gather_h0_w1 = out_x4.Slice(1, 2); - Tensor input_gather_h1_w0 = out_x4.Slice(2, 3); - Tensor input_gather_h1_w1 = out_x4.Slice(3, 4); + phi::DenseTensor input_gather_h0_w0 = out_x4.Slice(0, 1); + phi::DenseTensor input_gather_h0_w1 = out_x4.Slice(1, 2); + phi::DenseTensor input_gather_h1_w0 = out_x4.Slice(2, 3); + phi::DenseTensor input_gather_h1_w1 = out_x4.Slice(3, 4); F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0); F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1); F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0); @@ -425,12 +424,12 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, &ratio_h, &ratio_w); - Tensor h0, h1, w0, w1; + phi::DenseTensor h0, h1, w0, w1; h0.mutable_data({out_h}, place); h1.mutable_data({out_h}, place); w0.mutable_data({out_w}, place); w1.mutable_data({out_w}, place); - Tensor coef_h0, coef_h1, coef_w0, coef_w1; + phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; coef_h0.mutable_data({out_h}, place); coef_h1.mutable_data({out_h}, place); coef_w0.mutable_data({out_w}, place); @@ -454,7 +453,7 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, &coef_w0, &coef_w1); - Tensor gy_w0, gy_w1; + phi::DenseTensor gy_w0, gy_w1; gy_w0.mutable_data(outdim, place); gy_w1.mutable_data(outdim, place); F.Mul(gout, &coef_w0, &gy_w0); @@ -462,7 +461,7 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, auto dim_gather_h = indim; dim_gather_h[axis_h] = out_h; - Tensor g_gather_w0, g_gather_w1; + phi::DenseTensor g_gather_w0, g_gather_w1; g_gather_w0.mutable_data(dim_gather_h, place); g_gather_w1.mutable_data(dim_gather_h, place); w0.Resize({out_w, 1}); @@ -474,7 +473,7 @@ void BilinearBwdNpu(const framework::ExecutionContext& ctx, F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1); F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0); - Tensor gx_0, gx_1; + phi::DenseTensor gx_0, gx_1; gx_0.mutable_data(indim, place); gx_1.mutable_data(indim, place); h0.Resize({out_h, 1}); @@ -493,10 +492,11 @@ class InterpolateV2NPUKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 4UL, - platform::errors::External( - "NPU Interpolate Kernel only support 4-D Tensor.")); + PADDLE_ENFORCE_EQ( + input_dims.size(), + 4UL, + platform::errors::External( + "NPU Interpolate Kernel only support 4-D phi::DenseTensor.")); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); @@ -552,14 +552,16 @@ class InterpolateV2NPUKernel : public framework::OpKernel { scale_w > 0, true, platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "The scale_w in input 'Scale' phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_w)); PADDLE_ENFORCE_EQ( scale_h > 0, true, platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "The scale_h in input 'Scale' 
phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_h)); } else { @@ -704,14 +706,16 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel { scale_w > 0, true, platform::errors::InvalidArgument( - "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "The scale_w in input 'Scale' phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_w)); PADDLE_ENFORCE_EQ( scale_h > 0, true, platform::errors::InvalidArgument( - "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "The scale_h in input 'Scale' phi::DenseTensor of " + "Operator(interpolate) " "should be greater than 0, but received value is %d.", scale_h)); } else { diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 8070527a56a8c..2c2cab61521ef 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -135,12 +135,11 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { LOG(INFO) << loginfos.str(); } -using Tensor = phi::DenseTensor; template void BenchKernelXYZN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { - Tensor x, y, z; + phi::DenseTensor x, y, z; x.Resize({d}); y.Resize({d}); z.Resize({d}); @@ -161,7 +160,7 @@ void BenchKernelAXYN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { const T a = static_cast(3); - Tensor x, y; + phi::DenseTensor x, y; x.Resize({d}); y.Resize({d}); T* x_data = x.mutable_data(PlaceType()); @@ -177,7 +176,7 @@ template void BenchKernelXRN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { - Tensor x; + phi::DenseTensor x; RandomVec(d, x.mutable_data({d}, PlaceType())); T res; BenchAllImpls(d, x.data(), &res, d); @@ -188,7 +187,7 @@ template void BenchKernelXYN() { using T = typename KernelTuple::data_type; for (int d : TestSizes()) { - Tensor x, y; + phi::DenseTensor x, y; x.Resize({d}); y.Resize({d}); T* x_data = x.mutable_data(PlaceType()); @@ -205,7 +204,7 @@ void BenchKernelLSTM() { for (int d : TestSizes()) { const jit::lstm_attr_t attr( d, jit::kVSigmoid, jit::kVTanh, jit::kVTanh, use_peephole); - Tensor x, ct_1, ct, ht, wp, checked; + phi::DenseTensor x, ct_1, ct, ht, wp, checked; x.Resize({4 * d}); ct_1.Resize({d}); ct.Resize({d}); @@ -242,7 +241,7 @@ void BenchKernelGRU() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); auto place = PlaceType(); - Tensor x, ht_1, ht; + phi::DenseTensor x, ht_1, ht; x.Resize({3 * d}); ht_1.Resize({d}); ht.Resize({d}); @@ -269,7 +268,7 @@ void BenchKernelSeqPool() { jit::seq_pool_attr_t attr(w, type); for (int h : TestSizes()) { attr.h = h; - Tensor x, y; + phi::DenseTensor x, y; x.Resize({h * w}); y.Resize({w}); RandomVec(h * w, x.mutable_data(PlaceType()), -2.f, 2.f); @@ -287,7 +286,7 @@ void BenchKernelEmbSeqPool() { std::vector pool_types = {jit::SeqPoolType::kSum}; int64_t tbl_h = 1e4; for (int tbl_w : {10, 16, 256}) { - Tensor table; + phi::DenseTensor table; table.Resize({tbl_h, tbl_w}); RandomVec(tbl_h * tbl_w, table.mutable_data(PlaceType()), -2.f, 2.f); const T* table_data = table.data(); @@ -297,7 +296,7 @@ void BenchKernelEmbSeqPool() { int64_t out_w = tbl_w * idx_w; jit::emb_seq_pool_attr_t attr( tbl_h, tbl_w, idx_h, idx_w, out_w, type); - Tensor idx, out; + phi::DenseTensor idx, out; idx.Resize({idx_h, idx_w}); out.Resize({out_w}); RandomVec(idx_h * idx_w, @@ -348,12 
+347,12 @@ void BenchKernelSgd() { for (int param_h : {1, 1000}) { for (int grad_w : {1, 2, 8, 16, 30, 256}) { // only benchmark inplace - Tensor param; + phi::DenseTensor param; param.Resize({param_h, grad_w}); T* param_data = param.mutable_data(PlaceType()); RandomVec(param_h * grad_w, param_data, -2.f, 2.f); for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) { - Tensor grad; + phi::DenseTensor grad; grad.Resize({rows_size, grad_w}); std::vector rows = UnDuplicatedRandomVec(rows_size, 0, rows_size - 1); @@ -375,7 +374,7 @@ void BenchKernelMatMul() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { for (int k : TestSizes()) { - Tensor a, b, c; + phi::DenseTensor a, b, c; a.Resize({m * k}); b.Resize({k * n}); c.Resize({m * n}); @@ -397,7 +396,7 @@ void BenchKernelSoftmax() { using T = typename KernelTuple::data_type; for (int bs : {1, 2, 10}) { for (int n : TestSizes()) { - Tensor x, y; + phi::DenseTensor x, y; x.Resize({bs, n}); y.Resize({bs, n}); RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); @@ -418,7 +417,7 @@ void BenchKernelLayerNorm() { for (int x_dim_1 : TestSizes()) { int right = x_dim_1; int sz = left * right; - Tensor x, mean, var, scale, bias, out; + phi::DenseTensor x, mean, var, scale, bias, out; x.Resize({n, x_dim_0, x_dim_1}); out.Resize({n, x_dim_0, x_dim_1}); mean.Resize({n, x_dim_0}); @@ -462,7 +461,7 @@ void BenchKernelCRFDecoding() { for (int tag_num : TestSizes()) { int x_sz = seq_len * tag_num; int w_sz = (tag_num + state_trans_base_idx) * tag_num; - Tensor x, w, alpha, track; + phi::DenseTensor x, w, alpha, track; x.Resize({seq_len, tag_num}); w.Resize({tag_num + state_trans_base_idx, tag_num}); alpha.Resize({seq_len, tag_num}); @@ -486,12 +485,12 @@ template void BenchKernelVBroadcast() { using T = typename KernelTuple::data_type; for (int64_t w : {1, 16, 64, 100, 256}) { - Tensor x; + phi::DenseTensor x; x.Resize({w}); RandomVec(w, x.mutable_data(PlaceType())); const T* x_data = x.data(); for (int h : TestSizes()) { - Tensor y; + phi::DenseTensor y; y.Resize({h * w}); T* y_data = y.mutable_data(PlaceType()); BenchAllImpls( diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index f21e939a7b118..760675ea74663 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -20,8 +20,6 @@ limitations under the Licnse. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class KLDivLossNPUKernel : public framework::OpKernel { public: @@ -114,7 +112,7 @@ class KLDivLossGradNPUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); - Tensor loss_grad_transformed; + phi::DenseTensor loss_grad_transformed; if ("none" == reduction) { loss_grad_transformed.ShareDataWith(*loss_grad); } else { diff --git a/paddle/fluid/operators/label_smooth_op_mlu.cc b/paddle/fluid/operators/label_smooth_op_mlu.cc index 211ffc7fb2cd6..96f629e14df5c 100644 --- a/paddle/fluid/operators/label_smooth_op_mlu.cc +++ b/paddle/fluid/operators/label_smooth_op_mlu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LabelSmoothMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc index 529e8564cb19b..71bb1786bd018 100644 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ b/paddle/fluid/operators/label_smooth_op_npu.cc @@ -18,8 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void LabelSmoothMuls(const platform::Place& place, const aclrtStream& stream, @@ -70,15 +68,15 @@ class LabelSmoothNPUKernel : public framework::OpKernel { .stream(); if (dist_t) { - Tensor tmp; - Tensor dist; - Tensor tmp2; + phi::DenseTensor tmp; + phi::DenseTensor dist; + phi::DenseTensor tmp2; LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); LabelSmoothMuls(place, stream, dist_t, epsilon, &tmp2); tmp2.Resize({1, label_dim}); LabelSmoothAddBroadCast(place, stream, &tmp, &tmp2, out_t); } else { - Tensor tmp; + phi::DenseTensor tmp; LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); LabelSmoothAdds(place, stream, &tmp, (epsilon / label_dim), out_t); } diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 3d1bd7490795d..703a3b7506efc 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -33,7 +33,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using CudnnDataType = platform::CudnnDataType; template diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 1081df4166aac..461d77f324bcf 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; class LayerNormOp : public framework::OperatorWithKernel { @@ -210,9 +209,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::NotFound("Y@GRAD of LayerNorm Op is not found.")); - const Tensor *t = nullptr; - if (var->IsType()) { - t = &var->Get(); + const phi::DenseTensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); } else if (var->IsType()) { t = &var->Get(); } diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc index 7058f9f094923..deb7bb5045eba 100644 --- a/paddle/fluid/operators/layer_norm_op_mlu.cc +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template @@ -72,7 +71,7 @@ class LayerNormMLUKernel : public framework::OpKernel { GetBasePtr(mean), GetBasePtr(variance)); } else { - Tensor tmp_scale(x->dtype()); + phi::DenseTensor tmp_scale(x->dtype()); if (!scale) { tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); @@ -80,7 +79,7 @@ class LayerNormMLUKernel : public framework::OpKernel { tmp_scale = *scale; } - Tensor tmp_bias(x->dtype()); + phi::DenseTensor tmp_bias(x->dtype()); if (!bias) { tmp_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); FillMLUTensorWithHostValue(ctx, static_cast(0), &tmp_bias); @@ -95,7 +94,7 @@ class LayerNormMLUKernel : public framework::OpKernel { scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF); cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16); - Tensor final_scale(x->dtype()); + phi::DenseTensor final_scale(x->dtype()); if (final_scale.dtype() == DataType::FLOAT16 && tmp_scale.dtype() == DataType::FLOAT32) { final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); @@ -110,7 +109,7 @@ class LayerNormMLUKernel : public framework::OpKernel { final_scale = tmp_scale; } - Tensor final_bias(x->dtype()); + phi::DenseTensor final_bias(x->dtype()); if (final_bias.dtype() == DataType::FLOAT16 && tmp_bias.dtype() == DataType::FLOAT32) { final_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); @@ -181,7 +180,7 @@ class LayerNormGradMLUKernel : public framework::OpKernel { mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType()); MLUCnnlTensorDesc dx_desc(*dx); - Tensor tmp_scale(x->dtype()); + phi::DenseTensor tmp_scale(x->dtype()); if (!scale) { tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); @@ -196,7 +195,7 @@ class LayerNormGradMLUKernel : public framework::OpKernel { cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16); cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32); - Tensor final_scale(x->dtype()); + phi::DenseTensor final_scale(x->dtype()); if (final_scale.dtype() == DataType::FLOAT16 && tmp_scale.dtype() == DataType::FLOAT32) { final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); @@ -211,14 +210,14 @@ class LayerNormGradMLUKernel : public framework::OpKernel { final_scale = tmp_scale; } - Tensor tmp_dscale(x->dtype()); + phi::DenseTensor tmp_dscale(x->dtype()); if (dscale && (tmp_dscale.dtype() == dscale->dtype())) { dscale->mutable_data(place); tmp_dscale = *dscale; } else { tmp_dscale.mutable_data(phi::make_ddim(scale_bias_axes), place); } - Tensor tmp_dbias(x->dtype()); + phi::DenseTensor tmp_dbias(x->dtype()); if (dbias && (tmp_dbias.dtype() == dbias->dtype())) { dbias->mutable_data(place); tmp_dbias = *dbias; diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index f529bb651c042..5d0313a8f9404 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; using DataLayout = phi::DataLayout; @@ -75,10 +74,10 @@ class LayerNormNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor default_scale(x->type()); + phi::DenseTensor default_scale(x->type()); if (!scale) { default_scale.mutable_data(phi::make_ddim(axes), place); - Tensor value(x->type()); + phi::DenseTensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); const auto& runner = @@ -89,10 +88,10 @@ class LayerNormNPUKernel : public framework::OpKernel { const_cast(scale)->Resize(phi::make_ddim(axes)); } - Tensor default_bias(x->type()); + phi::DenseTensor default_bias(x->type()); if (!bias) { default_bias.mutable_data(phi::make_ddim(axes), place); - Tensor value(x->type()); + phi::DenseTensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(0)); const auto& runner = @@ -104,7 +103,7 @@ class LayerNormNPUKernel : public framework::OpKernel { } // cast scale from LayerNormParamType to T if needed - Tensor cast_scale(x->type()); + phi::DenseTensor cast_scale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(scale->dtype()) == @@ -124,7 +123,7 @@ class LayerNormNPUKernel : public framework::OpKernel { } // cast bias from LayerNormParamType to T if needed - Tensor cast_bias(x->type()); + phi::DenseTensor cast_bias(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(bias->dtype()) == @@ -147,7 +146,7 @@ class LayerNormNPUKernel : public framework::OpKernel { // mean should be of U type phi::DenseTensor* tmp_mean = mean; - Tensor cast_mean(x->type()); + phi::DenseTensor cast_mean(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(scale->dtype()) == @@ -164,7 +163,7 @@ class LayerNormNPUKernel : public framework::OpKernel { // same for variance phi::DenseTensor* tmp_variance = variance; - Tensor cast_variance(x->type()); + phi::DenseTensor cast_variance(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(scale->dtype()) == @@ -273,10 +272,10 @@ class LayerNormGradNPUKernel : public framework::OpKernel { const_cast(variance)->Resize( phi::make_ddim({new_shape})); - Tensor default_scale(x->type()); + phi::DenseTensor default_scale(x->type()); if (!scale) { default_scale.mutable_data(phi::make_ddim(axes), place); - Tensor value(x->type()); + phi::DenseTensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); const auto& runner = @@ -288,7 +287,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // cast scale from LayerNormParamType to T if needed - Tensor cast_scale(x->type()); + phi::DenseTensor cast_scale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(scale->dtype()) == @@ -308,7 +307,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // cast mean from LayerNormParamType to T if needed - Tensor cast_mean(x->type()); + phi::DenseTensor cast_mean(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(mean->dtype()) == @@ 
-328,7 +327,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { } // cast variance from LayerNormParamType to T if needed - Tensor cast_variance(x->type()); + phi::DenseTensor cast_variance(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && framework::TransToProtoVarType(variance->dtype()) == @@ -347,7 +346,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_variance.ShareDataWith(*variance); } - Tensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); + phi::DenseTensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); dx = (dx == nullptr) ? &dx_ : dx; dscale = (dscale == nullptr) ? &dscale_ : dscale; dbias = (dbias == nullptr) ? &dbias_ : dbias; @@ -361,7 +360,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { // dscale should be of U type phi::DenseTensor* tmp_dscale = dscale; - Tensor cast_dscale(x->type()); + phi::DenseTensor cast_dscale(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(mean->dtype()) == @@ -378,7 +377,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { // same for dbias phi::DenseTensor* tmp_dbias = dbias; - Tensor cast_dbias(x->type()); + phi::DenseTensor cast_dbias(x->type()); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::FP16 && (framework::TransToProtoVarType(mean->dtype()) == diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index d475eab967d78..2faf47538ffa5 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -26,8 +26,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template inline void ResizeToChannelFirst(const framework::ExecutionContext& context, const phi::DenseTensor* input, diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 28ae524e0a4f9..d14e4c75425c9 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -28,8 +28,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template __global__ void limit_by_capacity_impl( const T* expc, T* cap, T* out, const int n_expert, const int n_worker) { diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 47c6bef196be1..ed045fad4a95e 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void LogLossAdds(const platform::Place& place, const aclrtStream& stream, diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index 59e0c15678247..87e6d42e98ad5 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index 3f9ec485ce4f8..1c8001e371764 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 1ba6d6e31ecdc..04153eecc3927 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index e9369bcb475cc..f43fccb19e0b6 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -27,14 +27,13 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using DDim = framework::DDim; constexpr int64_t kNoPadding = -1; template -static std::vector CopyIdsToVector(const Tensor &ids) { +static std::vector CopyIdsToVector(const phi::DenseTensor &ids) { auto numel = ids.numel(); const auto *src = ids.data(); std::vector ret(numel); @@ -51,7 +50,7 @@ static std::vector CopyIdsToVector(const Tensor &ids) { template struct LookupTableV2CPUFunctor { LookupTableV2CPUFunctor(const framework::ExecutionContext &context, - const Tensor *ids_t) + const phi::DenseTensor *ids_t) : context_(context), ids_t_(ids_t) {} template @@ -143,7 +142,7 @@ struct LookupTableV2CPUFunctor { private: const framework::ExecutionContext &context_; - const Tensor *ids_t_; + const phi::DenseTensor *ids_t_; }; template @@ -160,7 +159,7 @@ class LookupTableV2Kernel : public framework::OpKernel { template struct LookupTableV2GradCPUFunctor { LookupTableV2GradCPUFunctor(const framework::ExecutionContext &context, - const Tensor *ids_t) + const phi::DenseTensor *ids_t) : context_(context), ids_t_(ids_t) {} template @@ -267,7 +266,7 @@ struct LookupTableV2GradCPUFunctor { private: const framework::ExecutionContext &context_; - const Tensor *ids_t_; + const phi::DenseTensor *ids_t_; }; template diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc index de9864aeee6a1..c407d91e6b80d 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class LookupTableV2MLUKernel : public framework::OpKernel { public: @@ -84,7 +82,7 @@ class LookupTableV2GradMLUKernel : public framework::OpKernel { "Number of ids greater than int32_t::max , please check " "number of ids in LookupTableV2GradMLUKernel.")); - Tensor ids_int32(ids_t->dtype()); + phi::DenseTensor ids_int32(ids_t->dtype()); if (ids_t->dtype() != DataType::INT32) { ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); MLUCnnlTensorDesc ids_desc(*ids_t); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index d11ef440f8a3f..3dc94d49244b1 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -22,7 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; constexpr int64_t kNoPadding = -1; template @@ -53,16 +52,16 @@ class LookupTableV2NPUKernel : public framework::OpKernel { .AddOutput(*output_t); runner.Run(); } else { - Tensor tmp_table_t(table_t->type()); + phi::DenseTensor tmp_table_t(table_t->type()); tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - Tensor index; + phi::DenseTensor index; index.mutable_data({1, 1}, ctx.GetPlace()); FillNpuTensorWithConstant(&index, static_cast(padding_idx)); auto updata_dim = phi::make_ddim({1, table_t->dims()[1]}); - Tensor update; + phi::DenseTensor update; update.mutable_data(updata_dim, ctx.GetPlace()); FillNpuTensorWithConstant(&update, static_cast(0)); update.Resize(updata_dim); @@ -109,7 +108,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { int embedding_dim = table_grad_t->dims()[1]; if (embedding_dim % 32 == 0) { - // NOTE(pangyoki): The embedding_dim of Tensor used in + // NOTE(pangyoki): The embedding_dim of phi::DenseTensor used in // EmbeddingDenseGrad must be an integer multiple of 32. int num_weights = table_grad_t->dims()[0]; const auto &runner = @@ -137,7 +136,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { {{"use_locking", true}}); runner_scatter.Run(stream); } else { - Tensor casted_ids_t; + phi::DenseTensor casted_ids_t; if (framework::TransToProtoVarType(ids_t->dtype()) != framework::proto::VarType::INT32) { casted_ids_t.mutable_data(ids_t->dims(), ctx.GetPlace()); diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index aa2596e6a22ba..b772aa82e9d7e 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -46,8 +46,6 @@ struct LRNFunctor { template class LRNKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; - // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) // x represents inputs // f(x) represents outputs @@ -141,7 +139,6 @@ struct LRNGradFunctor { template class LRNGradKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& ctx) const override { const phi::DenseTensor& x = *ctx.Input("X"); const phi::DenseTensor& out = *ctx.Input("Out"); diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index dc4f2f1548612..d5ced3edd2add 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template inline void ReorderInitState(const DeviceContext& ctx, const phi::DenseTensor& src, @@ -74,9 +72,9 @@ class LSTMKernel : public framework::OpKernel { framework::DDim dims({in_dims[0], frame_size}); if (bias) { - Tensor b = *bias; + phi::DenseTensor b = *bias; b.Resize({bias->numel(), 1}); - Tensor gate_bias = b.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); phi::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -95,7 +93,7 @@ class LSTMKernel : public framework::OpKernel { lstm_value.check_og = nullptr; } lstm_value.prev_state_value = nullptr; - Tensor ordered_c0; + phi::DenseTensor ordered_c0; framework::Vector order(batch_gate->lod()[2]); @@ -134,10 +132,10 @@ class LSTMKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor out_t = batch_hidden.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor out_t = batch_hidden.Slice(bstart, bend); + phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); int cur_batch_size = bend - bstart; @@ -160,7 +158,7 @@ class LSTMKernel : public framework::OpKernel { // Since the batch computing for LSTM reorders the input sequence // according to their length. The initialized hidden state also needs // to reorder. - Tensor ordered_h0; + phi::DenseTensor ordered_h0; ReorderInitState( device_ctx, *hidden_t0, order, &ordered_h0, true); blas.MatMul(ordered_h0, @@ -237,7 +235,7 @@ class LSTMGradKernel : public framework::OpKernel { // ordered_h0/c0 is the reordered hidden/cell initialization. // ordered_h0_g/c0_g is the reordered gradient of hidden/cell // initialization. 
- Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; framework::Vector order(batch_gate->lod()[2]); if (c0) { @@ -328,24 +326,24 @@ class LSTMGradKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor gate = batch_gate->Slice(bstart, bend); - Tensor cell = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate = batch_gate->Slice(bstart, bend); + phi::DenseTensor cell = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); lstm_value.gate_value = gate.data(); lstm_value.state_value = cell.data(); lstm_value.state_active_value = cell_pre_act.data(); - Tensor out_g = batch_hidden_g.Slice(bstart, bend); - Tensor gate_g = batch_gate_g.Slice(bstart, bend); - Tensor cell_g = batch_cell_g.Slice(bstart, bend); + phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); + phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); + phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); lstm_grad.state_grad = cell_g.data(); lstm_grad.gate_grad = gate_g.data(); lstm_grad.output_grad = out_g.data(); if (n > 0) { int bstart_pre = static_cast(batch_starts[n - 1]); - Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); lstm_value.prev_state_value = cell_pre.data(); lstm_grad.prev_state_grad = cell_pre_g.data(); } else { @@ -424,9 +422,9 @@ class LSTMGradKernel : public framework::OpKernel { } if (bias && bias_g) { /* backward bias */ - Tensor b_g = *bias_g; + phi::DenseTensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); - Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); phi::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 8056bf0bd49f2..c26a421966e7b 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -29,7 +29,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using platform::Transform; template { framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); if (bias) { - Tensor b = *bias; + phi::DenseTensor b = *bias; b.Resize({bias->numel(), 1}); - Tensor gate_bias = b.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias = b.Slice(0, 4 * frame_size); phi::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -156,8 +155,8 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.check_og = nullptr; } lstmp_value.prev_state_value = nullptr; - Tensor ordered_c0; - Tensor ordered_h0; + phi::DenseTensor ordered_c0; + phi::DenseTensor ordered_h0; framework::Vector order(batch_gate->lod()[2]); @@ -195,11 +194,11 @@ class LSTMPKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor gate_t = batch_gate->Slice(bstart, bend); - Tensor hidden_t = batch_hidden->Slice(bstart, bend); - Tensor proj_t = batch_proj.Slice(bstart, bend); - Tensor cell_t = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate_t = batch_gate->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor proj_t = batch_proj.Slice(bstart, bend); + phi::DenseTensor cell_t = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); int cur_batch_size = bend - bstart; @@ -349,7 +348,7 @@ class LSTMPGradKernel : public framework::OpKernel { // ordered_h0/c0 is the reordered hidden/cell initialization. // ordered_h0_g/c0_g is the reordered gradient of hidden/cell // initialization. - Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + phi::DenseTensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; framework::Vector order(batch_gate->lod()[2]); @@ -445,8 +444,8 @@ class LSTMPGradKernel : public framework::OpKernel { int bstart = static_cast(batch_starts[n]); int bend = static_cast(batch_starts[n + 1]); - Tensor cur_proj = batch_proj.Slice(bstart, bend); - Tensor proj_g = batch_proj_g.Slice(bstart, bend); + phi::DenseTensor cur_proj = batch_proj.Slice(bstart, bend); + phi::DenseTensor proj_g = batch_proj_g.Slice(bstart, bend); if (proj_clip && proj_clip > 0.0) { T* dx_data = proj_g.data(); @@ -472,7 +471,7 @@ class LSTMPGradKernel : public framework::OpKernel { proj_g_dev); } /* hidden state backwarad */ - Tensor out_g = batch_hidden_g.Slice(bstart, bend); + phi::DenseTensor out_g = batch_hidden_g.Slice(bstart, bend); blas.MatMul(proj_g, false, *proj_weight, @@ -482,7 +481,7 @@ class LSTMPGradKernel : public framework::OpKernel { static_cast(0.0)); /* projection weight backward*/ if (proj_weight_g) { - Tensor hidden_t = batch_hidden->Slice(bstart, bend); + phi::DenseTensor hidden_t = batch_hidden->Slice(bstart, bend); blas.MatMul(hidden_t, true, proj_g, @@ -492,23 +491,23 @@ class LSTMPGradKernel : public framework::OpKernel { static_cast(1.0)); } - Tensor gate = batch_gate->Slice(bstart, bend); - Tensor cell = batch_cell.Slice(bstart, bend); - Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + phi::DenseTensor gate = batch_gate->Slice(bstart, bend); + phi::DenseTensor cell = batch_cell.Slice(bstart, bend); + phi::DenseTensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); lstmp_value.gate_value = gate.data(); lstmp_value.state_value = cell.data(); lstmp_value.state_active_value = 
cell_pre_act.data(); - Tensor gate_g = batch_gate_g.Slice(bstart, bend); - Tensor cell_g = batch_cell_g.Slice(bstart, bend); + phi::DenseTensor gate_g = batch_gate_g.Slice(bstart, bend); + phi::DenseTensor cell_g = batch_cell_g.Slice(bstart, bend); lstmp_grad.state_grad = cell_g.data(); lstmp_grad.gate_grad = gate_g.data(); lstmp_grad.output_grad = out_g.data(); if (n > 0) { int bstart_pre = static_cast(batch_starts[n - 1]); - Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); - Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + phi::DenseTensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); lstmp_value.prev_state_value = cell_pre.data(); lstmp_grad.prev_state_grad = cell_pre_g.data(); } else { @@ -589,9 +588,9 @@ class LSTMPGradKernel : public framework::OpKernel { } if (bias && bias_g) { /* backward bias */ - Tensor b_g = *bias_g; + phi::DenseTensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); - Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + phi::DenseTensor gate_bias_g = b_g.Slice(0, 4 * frame_size); phi::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/masked_select_op_mlu.cc b/paddle/fluid/operators/masked_select_op_mlu.cc index 50c9973721836..86e4029512b07 100644 --- a/paddle/fluid/operators/masked_select_op_mlu.cc +++ b/paddle/fluid/operators/masked_select_op_mlu.cc @@ -39,7 +39,7 @@ class MaskedSelectedMLUKernel : public framework::OpKernel { input_dim, mask_dim)); - Tensor number(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor number(framework::TransToPhiDataType(VT::INT32)); void* number_ptr = number.mutable_data({1}, ctx.GetPlace()); out->Resize(mask->dims()); @@ -72,7 +72,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - Tensor mask_int32, out_size; + phi::DenseTensor mask_int32, out_size; std::vector out_size_vec; mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); out_size.mutable_data({1}, ctx.GetPlace()); @@ -118,10 +118,10 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); dev_ctx.Wait(); - Tensor mask_int32_tmp; + phi::DenseTensor mask_int32_tmp; mask_int32_tmp.ShareDataWith(mask_int32); mask_int32_tmp.Resize({mask_int32.numel()}); - Tensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)), + phi::DenseTensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)), indices_int32(framework::TransToPhiDataType(VT::INT32)); topk_v2_out.mutable_data({mask_int32.numel()}, ctx.GetPlace()); indices_int32.mutable_data({mask_int32.numel()}, ctx.GetPlace()); @@ -145,7 +145,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); - Tensor indices_int32_out; + phi::DenseTensor indices_int32_out; indices_int32_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); memory::Copy(ctx.GetPlace(), GetBasePtr(&indices_int32_out), @@ -154,7 +154,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { out_size_vec[0] * sizeof(int32_t), stream); - Tensor y_grad_tmp_out; + phi::DenseTensor y_grad_tmp_out; y_grad_tmp_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out); memory::Copy(ctx.GetPlace(), @@ -164,7 +164,7 @@ class MaskedSelectedGradMLUKernel : public framework::OpKernel { out_size_vec[0] * sizeof(T), stream); - Tensor 
indices_int32_tmp; + phi::DenseTensor indices_int32_tmp; indices_int32_tmp.ShareDataWith(indices_int32_out); indices_int32_tmp.Resize({out_size_vec[0], 1}); MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index facf44725e2b6..3473a051b7324 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const { @@ -353,7 +352,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { auto* d_x = ctx.Output(framework::GradVarName("X")); auto* d_y = ctx.Output(framework::GradVarName("Y")); - Tensor tmp_grad; + phi::DenseTensor tmp_grad; tmp_grad.Resize(tmp->dims()); auto* d_tmp_data = tmp_grad.mutable_data(ctx.GetPlace()); auto* top_diff = d_out->data(); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h index 72e99222ddffb..6aa5b12ff6778 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.h +++ b/paddle/fluid/operators/match_matrix_tensor_op.h @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class MatchMatrixTensorOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index 0038b25fb42de..0b6dc510f477f 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -26,8 +26,6 @@ namespace operators { namespace math { -using Tensor = phi::DenseTensor; - /* * \brief Context projection concatenates features in adjacent time-steps in * a sequence. The i-th row of the output is the concatenation of @@ -117,13 +115,13 @@ class ContextProjectFunctor { : static_cast(lod_level_0[i]); input_row_end = static_cast(lod_level_0[i + 1]); - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); if (input_row_begin < input_row_end) { - Tensor in_t = in.Slice(input_row_begin, input_row_end); + phi::DenseTensor in_t = in.Slice(input_row_begin, input_row_end); std::vector output_shape( {sequence_height, @@ -151,8 +149,9 @@ class ContextProjectFunctor { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); @@ -167,9 +166,9 @@ class ContextProjectFunctor { for (int k = 0; k < padding_rows; ++k) { int padding_size = k + context_length < up_pad ? 
context_length : up_pad - k; - Tensor out_t_sub = out_t.Slice(k * context_length, - k * context_length + padding_size); - Tensor w_sub = padding_data->Slice(k, k + padding_size); + phi::DenseTensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + phi::DenseTensor w_sub = padding_data->Slice(k, k + padding_size); framework::TensorCopy( w_sub, context.GetPlace(), context, &out_t_sub); } @@ -196,10 +195,10 @@ class ContextProjectFunctor { if (padding_begin > 0 || sequence_height == context_start) padding_idx = padding_begin + t; - Tensor out_t_sub = out_t.Slice( + phi::DenseTensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - Tensor w_sub = padding_data->Slice( + phi::DenseTensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); framework::TensorCopy( w_sub, context.GetPlace(), context, &out_t_sub); @@ -250,13 +249,14 @@ class ContextProjectGradFunctor { : static_cast(lod_level_0[i]); input_row_end = static_cast(lod_level_0[i + 1]); - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); if (input_row_begin < input_row_end) { - Tensor in_t = in.Slice(input_row_begin, input_row_end); + phi::DenseTensor in_t = in.Slice(input_row_begin, input_row_end); std::vector output_shape( {sequence_height, @@ -283,8 +283,9 @@ class ContextProjectGradFunctor { for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; - Tensor out_t = col->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); + phi::DenseTensor out_t = + col->Slice(static_cast(lod_level_0[i]), + static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); out_t.Resize({static_cast(sequence_height) * context_length, @@ -297,9 +298,9 @@ class ContextProjectGradFunctor { for (int k = 0; k < padding_rows; ++k) { int padding_size = k + context_length < up_pad ? 
context_length : up_pad - k; - Tensor out_t_sub = out_t.Slice(k * context_length, - k * context_length + padding_size); - Tensor w_sub = padding_data->Slice(k, k + padding_size); + phi::DenseTensor out_t_sub = out_t.Slice( + k * context_length, k * context_length + padding_size); + phi::DenseTensor w_sub = padding_data->Slice(k, k + padding_size); blas.AXPY(w_sub.numel(), static_cast(1), out_t_sub.data(), @@ -329,10 +330,10 @@ class ContextProjectGradFunctor { if (padding_begin > 0 || sequence_height == context_start) padding_idx = padding_begin + t; - Tensor out_t_sub = out_t.Slice( + phi::DenseTensor out_t_sub = out_t.Slice( (down_pad_begin_row + t) * context_length - padding_size, (down_pad_begin_row + t) * context_length); - Tensor w_sub = padding_data->Slice( + phi::DenseTensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); blas.AXPY(w_sub.numel(), static_cast(1), diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 1ba2d8f18ca1c..f4198acfd830c 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -55,9 +55,9 @@ static void CheckEighResult(const int batch, const int info) { template struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, - const Tensor &input, - Tensor *eigen_values, - Tensor *eigen_vectors, + const phi::DenseTensor &input, + phi::DenseTensor *eigen_values, + phi::DenseTensor *eigen_vectors, bool is_lower, bool has_vectors); }; @@ -69,9 +69,9 @@ template struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, - const Tensor &input, - Tensor *eigen_values, - Tensor *eigen_vectors, + const phi::DenseTensor &input, + phi::DenseTensor *eigen_values, + phi::DenseTensor *eigen_vectors, bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; @@ -80,7 +80,7 @@ struct MatrixEighFunctor { auto dito = math::DeviceIndependenceTensorOperations(ctx); - Tensor input_trans; + phi::DenseTensor input_trans; // lapack is a column-major storge, transpose make the input to // have a continuous memory layout input_trans = dito.Transpose(input); @@ -124,7 +124,7 @@ struct MatrixEighFunctor { lwork = std::max(1, static_cast(lwork_opt)); liwork = std::max(1, iwork_opt); - Tensor rwork_tensor; + phi::DenseTensor rwork_tensor; ValueType *rwork_data = nullptr; // complex type @@ -134,7 +134,7 @@ struct MatrixEighFunctor { rwork_data = rwork_tensor.mutable_data( phi::make_ddim({lrwork}), ctx.GetPlace()); } - Tensor iwork_tensor, work_tensor; + phi::DenseTensor iwork_tensor, work_tensor; auto *iwork_data = iwork_tensor.mutable_data(phi::make_ddim({liwork}), ctx.GetPlace()); auto *work_data = @@ -179,9 +179,9 @@ template struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, - const Tensor &input, - Tensor *eigen_values, - Tensor *eigen_vectors, + const phi::DenseTensor &input, + phi::DenseTensor *eigen_values, + phi::DenseTensor *eigen_vectors, bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; @@ -190,7 +190,7 @@ struct MatrixEighFunctor { auto &dev_ctx = ctx.template device_context(); auto dito = math::DeviceIndependenceTensorOperations(ctx); - Tensor input_trans; + phi::DenseTensor input_trans; input_trans = dito.Transpose(input); auto *input_vector = input_trans.data(); auto &dims = input.dims(); diff --git a/paddle/fluid/operators/math/sample_prob.cu 
b/paddle/fluid/operators/math/sample_prob.cu index e3cc5a5741b02..0c6b49729546c 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -31,8 +31,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; - template __device__ T gpu_adjust_prob(const T prob, const int num_samples, @@ -146,7 +144,7 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, int s_size = num_samples; framework::DDim s_dim{s_size}; - Tensor s; + phi::DenseTensor s; int64_t* s_data = s.mutable_data(s_dim, platform::CPUPlace()); math::LogUniformSampler sampler(dict_size, seed); diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 2464ac25186f0..7c60be6841552 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; - /* UNDERSTAND: utility function to adjust probability for unique sampling, return whatever as it is if not using unique samping */ template diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 65d4a479a4988..53b3b632dd4be 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -24,7 +24,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; template @@ -405,7 +404,7 @@ class SequencePoolFunctor { } auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - Tensor out_t = output->Slice(i, i + 1); + phi::DenseTensor out_t = output->Slice(i, i + 1); int64_t w = input.numel() / input.dims()[0]; if (lod[i] == lod[i + 1]) { for (int j = 0; j < w; ++j) { @@ -413,7 +412,7 @@ class SequencePoolFunctor { } continue; } - Tensor in_t = + phi::DenseTensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); int64_t h = static_cast(lod[i + 1] - lod[i]); auto in_e = EigenMatrix::From(in_t, phi::make_ddim({h, w})); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index c70e1e3e7405a..b7a9b9a19c970 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -23,7 +23,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; template @@ -42,7 +41,7 @@ void SoftmaxCUDNNFunctor::operator()( if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; } - // NOTE(*) : cudnn softmax only support >= 4D Tensor, + // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor, // fill 1 at unused dims if (cudnn_tensor_dims.size() <= 2) { cudnn_tensor_dims.resize(4, 1); @@ -95,7 +94,7 @@ void SoftmaxGradCUDNNFunctor::operator()( if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; } - // NOTE(*) : cudnn softmax only support >= 4D Tensor, + // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor, // fill 1 at unused dims if (cudnn_tensor_dims.size() <= 2) { cudnn_tensor_dims.resize(4, 1); diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index 3b467448ac09d..22bdc48768dae 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -20,7 +20,6 @@ namespace paddle { namespace 
operators { namespace math { -using Tensor = phi::DenseTensor; using Node = paddle::operators::math::TreeNode; template __global__ void tree2col(const T* eta, @@ -65,7 +64,7 @@ class Tree2ColFunctor { auto feature_dims = node_features.dims(); phi::funcs::SetConstant constant; - Tensor EdgeSet_cpu; + phi::DenseTensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); @@ -83,7 +82,7 @@ class Tree2ColFunctor { } size_t patch_size = processing_list.size(); - Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; + phi::DenseTensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; int* node = node_cpu.mutable_data({static_cast(total_size)}, cpu_place); T* eta = eta_cpu.mutable_data({static_cast(total_size * 3)}, @@ -142,7 +141,7 @@ class Col2TreeFunctor { auto output_dims = patch_grad.dims(); phi::funcs::SetConstant constant; - Tensor EdgeSet_cpu; + phi::DenseTensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); int64_t output_size = output_dims[1]; size_t patch_elem_size = 3 * static_cast(output_size); @@ -168,7 +167,7 @@ class Col2TreeFunctor { total_size += tmp.size(); } - Tensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; + phi::DenseTensor node_cpu, node_gpu, eta_cpu, eta_gpu, index_cpu, index_gpu; int* node = node_cpu.mutable_data({static_cast(total_size)}, cpu_place); T* eta = eta_cpu.mutable_data({static_cast(total_size * 3)}, diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc index e55996903a7d1..84d2f031d4bcb 100644 --- a/paddle/fluid/operators/matmul_op_mlu.cc +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void Mul(const framework::ExecutionContext& ctx, const phi::DenseTensor& X, @@ -183,7 +181,7 @@ class MatMulMLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -281,7 +279,7 @@ class MatMulGradMLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -335,7 +333,7 @@ class MatMulGradMLUKernel : public framework::OpKernel { std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); if (dX) { - Tensor dx_temp(X->type()); + phi::DenseTensor dx_temp(X->type()); if (x_dims != x_bcast_dims) { dx_temp.Resize(phi::make_ddim(x_bcast_dims)); } else { @@ -356,7 +354,7 @@ class MatMulGradMLUKernel : public framework::OpKernel { } if (dY) { - Tensor dy_temp(Y->type()); + phi::DenseTensor dy_temp(Y->type()); if (y_dims != y_bcast_dims) { dy_temp.Resize(phi::make_ddim(y_bcast_dims)); } else { diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc index 31b352b90f6a8..8ab395e8aa3e4 100644 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ b/paddle/fluid/operators/matmul_op_npu.cc @@ -19,7 +19,6 @@ limitations under the License. 
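Every hunk in this stretch of the patch applies the same mechanical rule: the file-local alias (using Tensor = phi::DenseTensor;) is deleted and each declaration spells out the full type. A minimal sketch of the resulting style for a scratch buffer, with a hypothetical helper name (MakeTempBuffer is not a Paddle function), assuming only the public phi::DenseTensor API:

#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

// Hypothetical helper showing the post-patch spelling of a temporary buffer.
phi::DenseTensor MakeTempBuffer(int64_t rows, int64_t cols) {
  phi::DenseTensor temp;                      // was: Tensor temp;
  temp.Resize(phi::make_ddim({rows, cols}));  // records the shape only; memory
                                              // is allocated later by
                                              // mutable_data(place)
  return temp;
}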
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -35,7 +34,7 @@ static void Mul(const framework::ExecutionContext& ctx, const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); runner_dx.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); runner_dx.Run(stream); @@ -59,7 +58,7 @@ static void Dot(const framework::ExecutionContext& ctx, const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); runner.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); out_temp_runner.Run(stream); @@ -89,7 +88,7 @@ static void MatMul2D(const framework::ExecutionContext& ctx, {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); runner.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& out_temp_runner = NpuOpRunner("MatMul", @@ -123,7 +122,7 @@ static void MatMulND(const framework::ExecutionContext& ctx, {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); runner.Run(stream); } else { - Tensor Out_temp(Out->dtype()); + phi::DenseTensor Out_temp(Out->dtype()); Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); const auto& out_temp_runner = NpuOpRunner("BatchMatMul", @@ -200,7 +199,7 @@ class MatMulNPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -268,7 +267,7 @@ class MatMulNPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->dtype()); + phi::DenseTensor x_temp_brd(X->dtype()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -283,7 +282,7 @@ class MatMulNPUKernel : public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->dtype()); + phi::DenseTensor y_temp_brd(Y->dtype()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -332,7 +331,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { // Case 1: [K] x [K] = [1] if (x_ndim == 1 && y_ndim == 1) { - Tensor dout_temp(dOut->dtype()); + phi::DenseTensor dout_temp(dOut->dtype()); dout_temp.Resize(X->dims()); dout_temp.mutable_data(ctx.GetPlace()); NpuOpRunner runner; @@ -352,7 +351,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -434,7 +433,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->dtype()); + phi::DenseTensor x_temp_brd(X->dtype()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -449,7 +448,7 @@ class MatMulGradNPUKernel : 
public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->dtype()); + phi::DenseTensor y_temp_brd(Y->dtype()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -480,7 +479,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { alpha); } } else { - Tensor dx_temp(X->dtype()); + phi::DenseTensor dx_temp(X->dtype()); dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); if (transpose_x) { MatMulND(ctx, @@ -520,7 +519,7 @@ class MatMulGradNPUKernel : public framework::OpKernel { alpha); } } else { - Tensor dy_temp(Y->dtype()); + phi::DenseTensor dy_temp(Y->dtype()); dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); if (transpose_y) { MatMulND(ctx, diff --git a/paddle/fluid/operators/matmul_v2_op_mlu.cc b/paddle/fluid/operators/matmul_v2_op_mlu.cc index 134819b7920a0..db7a92409bf6c 100644 --- a/paddle/fluid/operators/matmul_v2_op_mlu.cc +++ b/paddle/fluid/operators/matmul_v2_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template static void Mul(const framework::ExecutionContext& ctx, const phi::DenseTensor& X, @@ -193,7 +191,7 @@ class MatMulV2MLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -290,7 +288,7 @@ class MatMulGradV2MLUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -344,7 +342,7 @@ class MatMulGradV2MLUKernel : public framework::OpKernel { std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); if (dX) { - Tensor dx_temp(X->type()); + phi::DenseTensor dx_temp(X->type()); if (x_dims != x_bcast_dims) { dx_temp.Resize(phi::make_ddim(x_bcast_dims)); } else { @@ -375,7 +373,7 @@ class MatMulGradV2MLUKernel : public framework::OpKernel { ctx, x_temp, dout_temp, dY, !trans_x, false); } } else { - Tensor dy_temp(Y->type()); + phi::DenseTensor dy_temp(Y->type()); if (y_dims != y_bcast_dims) { dy_temp.Resize(phi::make_ddim(y_bcast_dims)); } else { diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index 4df3de71134ed..715171452a987 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template @@ -67,7 +66,7 @@ void MatMulND(const framework::ExecutionContext& ctx, const bool trans_x, const bool trans_y) { Out->mutable_data(ctx.GetPlace()); - Tensor x_fp32, y_fp32, out_fp32; + phi::DenseTensor x_fp32, y_fp32, out_fp32; x_fp32.Resize(X.dims()); y_fp32.Resize(Y.dims()); out_fp32.Resize(Out->dims()); @@ -173,7 +172,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp; + phi::DenseTensor x_temp, y_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); if (x_ndim == 1) { @@ -239,7 +238,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->type()); + phi::DenseTensor x_temp_brd(X->type()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -254,7 +253,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->type()); + phi::DenseTensor y_temp_brd(Y->type()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -295,7 +294,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { // Case 1: [K] x [K] = [1] if (x_ndim == 1 && y_ndim == 1) { - Tensor dout_temp(dOut->type()); + phi::DenseTensor dout_temp(dOut->type()); dout_temp.Resize(X->dims()); dout_temp.mutable_data(ctx.GetPlace()); NpuOpRunner runner; @@ -319,7 +318,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { } // Resize dim 1 to 2 - Tensor x_temp, y_temp, dout_temp; + phi::DenseTensor x_temp, y_temp, dout_temp; x_temp.ShareDataWith(*X); y_temp.ShareDataWith(*Y); dout_temp.ShareDataWith(*dOut); @@ -396,7 +395,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - Tensor x_temp_brd(X->type()); + phi::DenseTensor x_temp_brd(X->type()); if (x_dims == x_broadcast_dims) { x_temp_brd.ShareDataWith(*X); x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); @@ -411,7 +410,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { .Run(stream); } - Tensor y_temp_brd(Y->type()); + phi::DenseTensor y_temp_brd(Y->type()); if (y_dims == y_broadcast_dims) { y_temp_brd.ShareDataWith(*Y); y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); @@ -434,7 +433,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } } else { - Tensor dx_temp(X->type()); + phi::DenseTensor dx_temp(X->type()); dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); if (trans_x) { MatMulND( @@ -454,7 +453,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); } } else { - Tensor dy_temp(Y->type()); + phi::DenseTensor dy_temp(Y->type()); dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); if (trans_y) { MatMulND( diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 7681af011e663..9be97f5ba958e 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -20,7 +20,6 @@ limitations under the License. 
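The matmul gradient kernels above all stage their intermediate results in dtype-tagged temporaries before broadcasting or reducing them into the real outputs. A condensed sketch of that staging step, using the same calls that appear in the hunks; the function name StageGrad and its parameters are hypothetical:

#include <cstdint>
#include <vector>

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

// Stage a gradient in a temporary that carries the source dtype, mirroring
// the dx_temp / dy_temp / dout_temp temporaries in the kernels above.
template <typename T>
phi::DenseTensor StageGrad(const phi::DenseTensor& src,
                           const std::vector<int64_t>& broadcast_dims,
                           const phi::Place& place) {
  phi::DenseTensor tmp(src.type());            // was: Tensor tmp(src.type());
  tmp.Resize(phi::make_ddim(broadcast_dims));  // broadcast shape of the grad
  tmp.mutable_data<T>(place);                  // allocate on the target device
  return tmp;
}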
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template { auto out_correct_t = EigenTensor::From(*out_correct); // Tmp tensor - Tensor denominator; - Tensor valid_count; - Tensor iou_sum; + phi::DenseTensor denominator; + phi::DenseTensor valid_count; + phi::DenseTensor iou_sum; // get data ptr of tmp tensor int* denominator_data = denominator.mutable_data( diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc index 8fea989941c88..e9266b30fcd01 100644 --- a/paddle/fluid/operators/mean_op_mlu.cc +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -20,8 +20,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class MeanMLUKernel : public framework::OpKernel { public: @@ -79,12 +77,13 @@ class MeanMLUGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto output_grad = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(output_grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input Tensor len should be 1. But " - "received Out@Grad's elements num is %d.", - output_grad->numel())); + PADDLE_ENFORCE_EQ( + output_grad->numel(), + 1, + platform::errors::InvalidArgument( + "Mean Gradient Input phi::DenseTensor len should be 1. But " + "received Out@Grad's elements num is %d.", + output_grad->numel())); auto input_grad = context.Output(framework::GradVarName("X")); input_grad->mutable_data(context.GetPlace()); @@ -102,7 +101,7 @@ class MeanMLUGradKernel : public framework::OpKernel { } // means - Tensor mean_var(output_grad->dtype()); + phi::DenseTensor mean_var(output_grad->dtype()); mean_var.mutable_data(input_grad->dims(), context.GetPlace()); MLUCnnlTensorDesc mean_var_desc( mean_var, CNNL_LAYOUT_ARRAY, ToCnnlDataType(mean_var.dtype())); diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index 99fd77dd7f7df..3417045690ff6 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -16,8 +16,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class MeanNPUKernel : public framework::OpKernel { public: @@ -51,31 +49,32 @@ class MeanGradNPUKernel : public framework::OpKernel { auto grad = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ(grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input Tensor len should be 1. But " - "received Out@Grad's elements num is %d.", - grad->numel())); + PADDLE_ENFORCE_EQ( + grad->numel(), + 1, + platform::errors::InvalidArgument( + "Mean Gradient Input phi::DenseTensor len should be 1. 
But " + "received Out@Grad's elements num is %d.", + grad->numel())); auto IG = context.Output(framework::GradVarName("X")); IG->mutable_data(context.GetPlace()); // ones - Tensor ones(grad->dtype()); + phi::DenseTensor ones(grad->dtype()); ones.mutable_data(IG->dims(), context.GetPlace()); const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); runner_ones.Run(stream); // means - Tensor mean_tensor(grad->dtype()); + phi::DenseTensor mean_tensor(grad->dtype()); mean_tensor.Resize({1}); mean_tensor.mutable_data(context.GetPlace()); FillNpuTensorWithConstant( &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); // means mul ones - Tensor mean_ma(grad->dtype()); + phi::DenseTensor mean_ma(grad->dtype()); mean_ma.Resize(IG->dims()); mean_ma.mutable_data(context.GetPlace()); const auto& runner_mul_1 = diff --git a/paddle/fluid/operators/meshgrid_op_mlu.cc b/paddle/fluid/operators/meshgrid_op_mlu.cc index 76beb021bc654..f0103afbb0bc5 100644 --- a/paddle/fluid/operators/meshgrid_op_mlu.cc +++ b/paddle/fluid/operators/meshgrid_op_mlu.cc @@ -24,12 +24,12 @@ class MeshgridMLUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput("Out"); - PADDLE_ENFORCE_EQ( - (ins.size() > 1) && (ins.size() < 7), - true, - platform::errors::InvalidArgument( - "Excepted Tensor numbers between 2 and 6, but only received d% .", - ins.size())); + PADDLE_ENFORCE_EQ((ins.size() > 1) && (ins.size() < 7), + true, + platform::errors::InvalidArgument( + "Excepted phi::DenseTensor numbers between 2 and 6, " + "but only received d% .", + ins.size())); int64_t size = ins.size(); std::vector shape(size); diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index ec78fb09eab30..b66966ac64b90 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -36,8 +36,8 @@ class AccuracyMLUKernel : public framework::OpKernel { } // cast `indices` or `label` if their type is not INT32 - Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); - Tensor label_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32)); auto indices_type = framework::TransToProtoVarType(indices->type()); if (indices_type != VT::INT32) { PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32), @@ -89,7 +89,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // equal MLUCnnlTensorDesc indices_int32_desc(indices_int32); MLUCnnlTensorDesc label_int32_desc(label_int32); - Tensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); + phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); equal_tensor.Resize(indices->dims()); equal_tensor.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_tensor_desc(equal_tensor); @@ -103,7 +103,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&equal_tensor)); // cast equal - Tensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); equal_fp32.Resize(indices->dims()); equal_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_fp32_desc(equal_fp32); @@ -117,7 +117,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // [correct] // reduce_max - Tensor 
correct_max(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32)); correct_max.Resize(phi::make_ddim({num_samples})); correct_max.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_max_desc(correct_max); @@ -140,7 +140,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&correct_max)); // reduce_sum - Tensor correct_sum(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32)); correct_sum.Resize(correct->dims()); correct_sum.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_sum_desc(correct_sum); @@ -183,7 +183,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(total)); // use `total` of type `float32` for calculating accuracy - Tensor total_fp32(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32)); total_fp32.Resize(total->dims()); total_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_fp32_desc(total_fp32); diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 4c83071264a42..737228902b6e7 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h index 55be510dcd237..bec8bba09ad1a 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.h +++ b/paddle/fluid/operators/metrics/precision_recall_op.h @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index d27234344ff27..146ee52fc62ff 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -26,7 +26,6 @@ namespace operators { using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using Tensor = phi::DenseTensor; using dnnl::stream; template diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index be965c4abb895..c2556b6bfc41d 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,16 +20,15 @@ limitations under the License. */ namespace { using dnnl::memory; using paddle::framework::ExecutionContext; +using paddle::framework::GradVarName; using paddle::platform::MatMulV2MKLDNNHandler; using phi::OneDNNContext; using phi::vectorize; using phi::funcs::OneDNNGetDataType; -using Tensor = phi::DenseTensor; -using paddle::framework::GradVarName; // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. -static Tensor FoldOuterDims(const Tensor &input) { +static phi::DenseTensor FoldOuterDims(const phi::DenseTensor &input) { auto output = input; auto in_dims = input.dims(); if (in_dims.size() == 3) { @@ -42,14 +41,14 @@ static Tensor FoldOuterDims(const Tensor &input) { // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. 
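For readability, this is the post-patch shape of the FoldOuterDims helper whose signature changes in the hunk above. The body is reconstructed from the surrounding comment ("P x M x N to (P * M) x N"), so read it as a sketch rather than a verbatim quote of the committed code:

#include "paddle/phi/core/dense_tensor.h"

// Collapse a rank-3 tensor from P x M x N to (P * M) x N; identity otherwise.
static phi::DenseTensor FoldOuterDims(const phi::DenseTensor& input) {
  auto output = input;            // copies the metadata, shares the storage
  auto in_dims = input.dims();
  if (in_dims.size() == 3) {
    output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
  }
  return output;
}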
template -static Tensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, - const Tensor *input) { +static phi::DenseTensor FoldFirstAndLastDims(const OneDNNContext &dev_ctx, + const phi::DenseTensor *input) { auto input_dims = vectorize(input->dims()); if (input_dims.size() != 3) { return *input; } - Tensor output; + phi::DenseTensor output; output.Resize({input_dims[1], input_dims[0], input_dims[2]}); auto output_dims = vectorize(output.dims()); @@ -89,11 +88,11 @@ class MatMulMKLDNNHandler public: MatMulMKLDNNHandler(const dnnl::engine engine, paddle::platform::Place cpu_place, - Tensor *x, + phi::DenseTensor *x, bool trans_x, - Tensor *y, + phi::DenseTensor *y, bool trans_y, - Tensor *out, + phi::DenseTensor *out, float scale) : phi::funcs::OneDNNHandlerNoCachingT(engine, cpu_place) { @@ -129,7 +128,7 @@ class MatMulMKLDNNHandler this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); } - std::shared_ptr AcquireWeightsMemory(const Tensor *input) { + std::shared_ptr AcquireWeightsMemory(const phi::DenseTensor *input) { const YT *input_data = input->data(); return this->AcquireMemoryFromPrimitive( this->fwd_pd_->weights_desc(), @@ -176,11 +175,10 @@ class MatMulMKLDNNHandler // We cannot use base AcquireDstMemory as it makes an allocation request // base on DST memory primitive size. This is fine in general, but in MatMul // we have primitive that covers only one batch of Data and then shift - // pointer for every new batch. Hence Tensor size is bigger that dst memory - // primitive size. So would we request less memory that is there and it - // triggers an - // assertion. So as there is no 'any' format here we can leave default size - // of Tensor as computed in ComputeInferShape + // pointer for every new batch. Hence phi::DenseTensor size is bigger that + // dst memory primitive size. So would we request less memory that is there + // and it triggers an assertion. So as there is no 'any' format here we can + // leave default size of phi::DenseTensor as computed in ComputeInferShape OT *ptr = output->mutable_data(this->place_); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -199,7 +197,7 @@ class MatMulMKLDNNHandler * If transposed, `H,W` will be swapped. */ static void ReshapeTensorToMatrixSequence( - Tensor *x, const phi::funcs::MatDescriptor &descriptor) { + phi::DenseTensor *x, const phi::funcs::MatDescriptor &descriptor) { int64_t h, w; h = descriptor.height_; w = descriptor.width_; @@ -227,8 +225,11 @@ static void ReshapeTensorToMatrixSequence( * If any of `X` and `Y` has batch size BatchSize, the out will have the * BatchSize. 
*/ -static void ReshapeXYOutToMatrixSequence( - Tensor *x, Tensor *y, Tensor *out, bool trans_x, bool trans_y) { +static void ReshapeXYOutToMatrixSequence(phi::DenseTensor *x, + phi::DenseTensor *y, + phi::DenseTensor *out, + bool trans_x, + bool trans_y) { auto x_dim = phi::funcs::RowMatrixDimsFromVector(x->dims()); auto y_dim = phi::funcs::ColumnMatrixDimsFromVector(y->dims()); auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); @@ -326,13 +327,13 @@ bool IsOutputFused(const ExecutionContext &ctx) { template void ExecuteMatMulV2(const ExecutionContext &ctx, const dnnl::engine onednn_engine, - const Tensor *x, + const phi::DenseTensor *x, const std::vector &x_dims, bool trans_x, - const Tensor *y, + const phi::DenseTensor *y, const std::vector &y_dims, bool trans_y, - Tensor *out) { + phi::DenseTensor *out) { std::vector x_strides_override = GetInputStrides(ctx, "X"); std::vector y_strides_override = GetInputStrides(ctx, "Y"); MatMulV2MKLDNNHandler handler(ctx, @@ -471,7 +472,7 @@ class MatMulMKLDNNKernel : public paddle::framework::OpKernel { const std::vector &y_dims, std::vector *x_bd_dims, std::vector *y_bd_dims, - Tensor *out) const { + phi::DenseTensor *out) const { if (x_dims.size() == 1) { (*x_bd_dims)[(*x_bd_dims).size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { @@ -501,7 +502,7 @@ class MatMulMKLDNNKernel : public paddle::framework::OpKernel { (*y_bd_dims)[i] == 1, true, paddle::platform::errors::InvalidArgument( - "Tensor dimensions are incorrect for broadcasting." + "phi::DenseTensor dimensions are incorrect for broadcasting." "Dimensions in X and Y must be same or equal to 1, but " "received x_dim[%d]=%d and y_dims[%d]= %d", i, @@ -649,7 +650,7 @@ class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel { bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && out->dims().size() == 2; - Tensor x_combined, y_combined; + phi::DenseTensor x_combined, y_combined; if (!need_combine) { x_combined = *x; y_combined = *y; diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 098623ea52466..c23f247c9d212 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -25,7 +25,6 @@ namespace operators { using dnnl::memory; using dnnl::primitive; using dnnl::reorder; -using Tensor = phi::DenseTensor; using dnnl::stream; using phi::DataLayout; diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 36498e60f4e54..a9408ad38e3a1 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -25,7 +25,6 @@ namespace operators { using dnnl::memory; using dnnl::reorder; -using Tensor = phi::DenseTensor; namespace { diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index ff2484c7ced38..0c2b439b3e510 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -40,7 +40,7 @@ static std::vector extract_shape( tensor->dims(), phi::make_ddim({1}), platform::errors::InvalidArgument( - "If the element type of 'shape' in ReshapeOp is Tensor, " + "If the element type of 'shape' in ReshapeOp is phi::DenseTensor, " "the element's shape must be [1]. 
But received the element's shape " "is [%s]", tensor->dims())); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 2c5b269c3923b..077107dca68f3 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -21,7 +21,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using phi::DataLayout; using phi::OneDNNContext; @@ -37,8 +36,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& dnnl_engine = dev_ctx.GetEngine(); std::vector transpose_axis = ctx.Attr>("axis"); int ndims = transpose_axis.size(); - const phi::DenseTensor* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); + const phi::DenseTensor* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto& astream = OneDNNContext::tls().get_stream(); @@ -122,8 +121,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL TransposeGrad must use CPUPlace")); - const auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); + const auto* dout = + ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); if (!dx) return; auto& dev_ctx = ctx.template device_context(); const auto& dnnl_engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index d205bc2b2554d..09b1551086fab 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -386,7 +386,7 @@ MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor, const mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype) { auto dims = phi::vectorize(tensor.dims()); @@ -407,11 +407,11 @@ MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, } } -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor) +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor) : MLUOpTensorDesc( tensor, MLUOP_LAYOUT_ARRAY, ToMluOpDataType(tensor.dtype())) {} -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor, mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype, int position) @@ -420,7 +420,7 @@ MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); } -MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, +MLUOpTensorDesc::MLUOpTensorDesc(const phi::DenseTensor& tensor, mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype, int position, @@ -562,7 +562,7 @@ const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const { return mlu_generator; } -Tensor& MLUCnnlRandomGeneratorDesc::get_state() { return mlu_state; } +phi::DenseTensor& MLUCnnlRandomGeneratorDesc::get_state() { return mlu_state; } MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() { if (mlu_generator) { @@ -953,7 +953,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = 
workspace.mutable_data(ctx.GetPlace()); @@ -981,7 +981,7 @@ MLURNNDesc::~MLURNNDesc() { PADDLE_ENFORCE_MLU_SUCCESS( cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); - Tensor workspace(paddle::experimental::DataType::INT8); + phi::DenseTensor workspace(paddle::experimental::DataType::INT8); workspace.Resize(framework::DDim({static_cast(workspace_size)})); void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); @@ -1011,7 +1011,7 @@ MLURNNDesc::~MLURNNDesc() { handle, in0_desc, in1_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1067,7 +1067,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_quant_desc, output_desc, local_size, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1104,7 +1104,7 @@ MLURNNDesc::~MLURNNDesc() { // use ctx allocate interface for profiling purpose auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1580,7 +1580,7 @@ MLURNNDesc::~MLURNNDesc() { handle, in0_desc, in1_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1634,7 +1634,7 @@ MLURNNDesc::~MLURNNDesc() { handle, a_desc, b_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1665,7 +1665,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetAxWorkspaceSize(handle, alpha_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1754,7 +1754,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1803,7 +1803,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1831,7 +1831,7 @@ MLURNNDesc::~MLURNNDesc() { PADDLE_ENFORCE_MLU_SUCCESS( cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); - Tensor workspace(paddle::experimental::DataType::INT8); + phi::DenseTensor workspace(paddle::experimental::DataType::INT8); workspace.Resize(framework::DDim({static_cast(workspace_size)})); void* 
workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); @@ -1947,7 +1947,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -1979,7 +1979,7 @@ MLURNNDesc::~MLURNNDesc() { handle, condition_desc, then_desc, else_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2023,7 +2023,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2162,7 +2162,7 @@ MLURNNDesc::~MLURNNDesc() { handle, pool_mode, output_w, output_h, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2217,7 +2217,7 @@ MLURNNDesc::~MLURNNDesc() { handle, pool_mode, output_shape[2], output_shape[1], &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2270,7 +2270,7 @@ MLURNNDesc::~MLURNNDesc() { handle, data_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2359,7 +2359,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2433,7 +2433,7 @@ MLURNNDesc::~MLURNNDesc() { size_t workspace_size = 0; void* workspace_ptr = nullptr; - Tensor workspace; + phi::DenseTensor workspace; if (need_workspace) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetReduceOpWorkspaceSize( handle, input_desc, output_desc, reduction_desc, &workspace_size)); @@ -2473,7 +2473,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2502,7 +2502,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2530,7 +2530,7 @@ MLURNNDesc::~MLURNNDesc() { 
cnnlGetMaximumWorkspaceSize(handle, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2558,7 +2558,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetMinimumWorkspaceSize(handle, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2587,7 +2587,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2617,7 +2617,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2647,7 +2647,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2677,7 +2677,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -2944,7 +2944,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetDynamicStitchWorkspaceSize(handle, size, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3203,7 +3203,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetNmsWorkspaceSize_v2(handle, confidence_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3252,7 +3252,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3427,7 +3427,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetLayerNormOpWorkspaceSize(handle, axis, x_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3499,7 +3499,7 @@ 
MLURNNDesc::~MLURNNDesc() { cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3565,7 +3565,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3708,7 +3708,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlMakeFusedOpsPlan(handle, fusion_plan, cparam_pack, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3773,7 +3773,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3839,7 +3839,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3899,7 +3899,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -3967,7 +3967,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4021,7 +4021,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4079,7 +4079,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4135,7 +4135,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4207,7 +4207,7 @@ MLURNNDesc::~MLURNNDesc() { handle, matmul_desc, a_desc, b_desc, output_desc, algo, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4295,7 
+4295,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4399,7 +4399,7 @@ MLURNNDesc::~MLURNNDesc() { &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4450,7 +4450,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, perm_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4517,7 +4517,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetWhereWorkspaceSize(handle, num_true_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4593,7 +4593,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input1_desc, input2_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4623,7 +4623,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlGetQRWorkspaceSize(handle, a_desc, some, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4667,7 +4667,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4705,7 +4705,7 @@ MLURNNDesc::~MLURNNDesc() { handle, target_desc, weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4742,7 +4742,7 @@ MLURNNDesc::~MLURNNDesc() { handle, x_desc, algorithm, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4778,7 +4778,7 @@ MLURNNDesc::~MLURNNDesc() { handle, x_desc, algorithm, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4857,7 +4857,7 @@ MLURNNDesc::~MLURNNDesc() { handle, diff_desc, output_desc, scale_grad_by_freq, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = 
ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -4903,7 +4903,7 @@ MLURNNDesc::~MLURNNDesc() { "MLU RNNForward failed. x_desc initializing failed.")); auto& dev_ctx = GetDevCtxFromCTX(ctx); size_t workspace_size, reservespace_size; - Tensor workspace; + phi::DenseTensor workspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNTempSizes( handle, rnn_desc, x_desc, &workspace_size, &reservespace_size)); workspace = ctx.AllocateTmpTensor( @@ -4967,7 +4967,7 @@ MLURNNDesc::~MLURNNDesc() { "MLU RNNForward failed. x_desc initializing failed.")); auto& dev_ctx = GetDevCtxFromCTX(ctx); size_t workspace_size; - Tensor workspace; + phi::DenseTensor workspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNTempSizes( handle, rnn_desc, x_desc, &workspace_size, &reservespace_size)); workspace = ctx.AllocateTmpTensor( @@ -5028,7 +5028,7 @@ MLURNNDesc::~MLURNNDesc() { cnnlHandle_t handle = GetHandleFromCTX(ctx); auto& dev_ctx = GetDevCtxFromCTX(ctx); size_t workspace_size; - Tensor workspace; + phi::DenseTensor workspace; PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetMaskedWorkspaceSize(handle, masked_mode, input_desc, @@ -5075,7 +5075,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, weight_desc, pos_weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -5119,7 +5119,7 @@ MLURNNDesc::~MLURNNDesc() { handle, target_desc, weight_desc, pos_weight_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); @@ -5227,7 +5227,7 @@ MLURNNDesc::~MLURNNDesc() { handle, input_desc, grid_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); - Tensor workspace = ctx.AllocateTmpTensor( + phi::DenseTensor workspace = ctx.AllocateTmpTensor( {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 8fbaade9dc01b..413158f441a7b 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -29,7 +29,6 @@ limitations under the License. 
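The long run of mlu_baseop.cc hunks above repeats one idiom: query the cnnl workspace size, park it in a throwaway phi::DenseTensor, and pass the raw pointer to the MLU kernel. A condensed sketch of the variant used where only a device context is at hand; the helper name is hypothetical, and the static_cast template argument elided in this excerpt is assumed to be int64_t:

#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"

// Build an INT8-typed scratch buffer sized to a cnnl workspace query,
// mirroring the workspace temporaries in the hunks above.
phi::DenseTensor MakeCnnlWorkspace(size_t workspace_size,
                                   const phi::Place& place) {
  phi::DenseTensor workspace(phi::DataType::INT8);  // raw byte buffer
  workspace.Resize(phi::make_ddim({static_cast<int64_t>(workspace_size)}));
  workspace.mutable_data(place);  // allocates workspace_size bytes on `place`
  return workspace;
}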
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; using ExecutionContext = framework::ExecutionContext; using DeviceContextPool = platform::DeviceContextPool; @@ -377,18 +376,18 @@ class MLUOpTensorDesc { const mluOpDataType_t tensor_dtype, int position); - MLUOpTensorDesc(const Tensor& tensor, + MLUOpTensorDesc(const phi::DenseTensor& tensor, const mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype); - explicit MLUOpTensorDesc(const Tensor& tensor); + explicit MLUOpTensorDesc(const phi::DenseTensor& tensor); - MLUOpTensorDesc(const Tensor& tensor, + MLUOpTensorDesc(const phi::DenseTensor& tensor, mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype, int position); - MLUOpTensorDesc(const Tensor& tensor, + MLUOpTensorDesc(const phi::DenseTensor& tensor, mluOpTensorLayout_t layout, const mluOpDataType_t tensor_dtype, int position, @@ -458,11 +457,11 @@ class MLUCnnlRandomGeneratorDesc { public: MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed); const cnnlRandGenerator_t get() const; - Tensor& get_state(); + phi::DenseTensor& get_state(); ~MLUCnnlRandomGeneratorDesc(); private: - Tensor mlu_state; + phi::DenseTensor mlu_state; cnnlRandGenerator_t mlu_generator = nullptr; }; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 330f4ca3596bd..bd4451ebda46d 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - struct ModifiedHuberLossBackward { template HOSTDEVICE void operator()(Tuple t) const { diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h index 50d5a14548e35..62600ed7c6970 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.h +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b83bc8ea6541b..483c2bda72efa 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -27,7 +27,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc index 206c7b041a9b3..74f3578c6e8d4 100644 --- a/paddle/fluid/operators/multinomial_op_npu.cc +++ b/paddle/fluid/operators/multinomial_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class NPUMultinomialKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 749849a333f3d..ba263427caa87 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MultiplexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index a4b418b14cc84..4b9fe86b22565 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -31,7 +31,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; using Sampler = math::Sampler; using DDim = framework::DDim; @@ -44,7 +43,7 @@ using EigenMatrix = framework::EigenMatrix; template void PrepareSamples(const framework::ExecutionContext &context, Sampler *sampler, - Tensor *sample_labels) { + phi::DenseTensor *sample_labels) { auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); @@ -154,9 +153,9 @@ class NCEKernel : public framework::OpKernel { std::vector sample_out_dims; auto label = context.Input("Label"); - Tensor *sample_labels; - Tensor *sample_out; - Tensor sample_labels_tmp, sample_out_tmp; + phi::DenseTensor *sample_labels; + phi::DenseTensor *sample_out; + phi::DenseTensor sample_labels_tmp, sample_out_tmp; if (is_test) { // set dims of output(SampleOut) int num_true_classes = label->dims().size() == 2 ? label->dims()[1] : 1; @@ -339,7 +338,7 @@ class NCEGradKernel : public framework::OpKernel { } // T b = 1. / num_total_classes * num_neg_samples; - Tensor sample_grad; // tmp tensor + phi::DenseTensor sample_grad; // tmp tensor T *sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); // backward cost diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc index c5f0749227e23..619f902513459 100644 --- a/paddle/fluid/operators/norm_op_npu.cc +++ b/paddle/fluid/operators/norm_op_npu.cc @@ -16,7 +16,6 @@ namespace paddle { namespace operators { using DDim = framework::DDim; -using Tensor = phi::DenseTensor; void CheckAxis(int axis, int rank) { // check the axis is in [-rank, rank-1] diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index b6e27e6b54151..2412913995b95 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -37,7 +37,6 @@ namespace cub = hipcub; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, @@ -433,21 +432,21 @@ __global__ void DoubleGradComputeDDYWithGlobal(const T *ddx, template void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, - const Tensor *X, - const Tensor *Scale, - const Tensor *dY, - const Tensor *Saved_mean, - const Tensor *Saved_variance, - const Tensor *Mean, - const Tensor *Variance, + const phi::DenseTensor *X, + const phi::DenseTensor *Scale, + const phi::DenseTensor *dY, + const phi::DenseTensor *Saved_mean, + const phi::DenseTensor *Saved_variance, + const phi::DenseTensor *Mean, + const phi::DenseTensor *Variance, const double epsilon, const bool use_global_stats, - const Tensor *ddX, - const Tensor *ddScale, - const Tensor *ddBias, - Tensor *dX, - Tensor *dScale, - Tensor *ddY) { + const phi::DenseTensor *ddX, + const phi::DenseTensor *ddScale, + const phi::DenseTensor *ddBias, + phi::DenseTensor *dX, + phi::DenseTensor *dScale, + phi::DenseTensor *ddY) { const T 
*x_data = X->data(); const T *dy_data = dY->data(); const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); @@ -463,7 +462,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, const int N = x_dims[0]; const int num = X->numel(); const int sample_size = num / N / C; - Tensor scale_tmp; + phi::DenseTensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); set_constant(ctx, &scale_tmp, static_cast(1)); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 99623917d59ee..fdab03698711c 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -37,8 +37,6 @@ static inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } -using Tensor = phi::DenseTensor; - template __global__ void initialize_zero_kernel(T* data, const int length) { CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index d878fd5a6d44b..41ec3eb9a135f 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -76,7 +76,6 @@ struct OneHotOpFunctor { } }; -using Tensor = phi::DenseTensor; template class OneHotKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc index e2997dc079c61..35e8bcde9daad 100644 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class OneHotNPUKernel : public framework::OpKernel { @@ -54,7 +53,7 @@ class OneHotNPUKernel : public framework::OpKernel { .AddOutput(*out); runner.Run(dev_ctx.stream()); } else { - Tensor transformed_in; + phi::DenseTensor transformed_in; transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); const auto& cast_runner = NpuOpRunner( "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 66826cd4ff33a..e4f8555fceae2 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -22,8 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class OneHotXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc index f98cbabf58a87..0b2fbfe85d403 100644 --- a/paddle/fluid/operators/one_hot_v2_op_mlu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class OneHotV2MLUKernel : public framework::OpKernel { @@ -44,10 +43,12 @@ class OneHotV2MLUKernel : public framework::OpKernel { float on_value = 1.0f, off_value = 0.0f; const int in_off_dim[1] = {1}; - Tensor on_value_tensor = ctx.AllocateTmpTensor( - framework::DDim(in_off_dim, 1), dev_ctx); - Tensor off_value_tensor = ctx.AllocateTmpTensor( - framework::DDim(in_off_dim, 1), dev_ctx); + phi::DenseTensor on_value_tensor = + ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + phi::DenseTensor off_value_tensor = + ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor); FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor); @@ -64,7 +65,7 @@ class OneHotV2MLUKernel : public framework::OpKernel { ToCnnlDataType(out->dtype()), GetBasePtr(out)); } else { - Tensor transformed_in; + phi::DenseTensor transformed_in; transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); // use cnnlCast to cast int64_t to int32_t then do one_hot MLUCnnlTensorDesc in_desc(*in); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index 8cc97b417ca78..d305a04ea0782 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class OneHotV2NPUKernel : public framework::OpKernel { @@ -53,7 +52,7 @@ class OneHotV2NPUKernel : public framework::OpKernel { .AddOutput(*out); runner.Run(dev_ctx.stream()); } else { - Tensor transformed_in; + phi::DenseTensor transformed_in; transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); const auto& cast_runner = NpuOpRunner( "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 4390da3c4e479..262aa0fc350e2 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 4f800233c24fe..54643a39bcd4c 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -25,7 +25,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class AdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index aa331df4cbd0c..cf447bc593103 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -19,8 +19,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class AdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc index c9c33643d1ee5..d998cff14126c 100644 --- a/paddle/fluid/operators/optimizers/adam_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AdamMLUKernel : public framework::OpKernel { public: @@ -156,9 +154,9 @@ class AdamMLUKernel : public framework::OpKernel { const phi::DenseTensor* beta2_tensor = nullptr; const phi::DenseTensor* epsilon_tensor = nullptr; - Tensor beta1_tmp(experimental::DataType::FLOAT32); - Tensor beta2_tmp(experimental::DataType::FLOAT32); - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta1_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta2_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); if (ctx.HasInput("Beta1Tensor")) { beta1_tensor = ctx.Input("Beta1Tensor"); @@ -462,9 +460,9 @@ class MergedAdamMLUKernel : public framework::OpKernel { const phi::DenseTensor* beta2_tensor = nullptr; const phi::DenseTensor* epsilon_tensor = nullptr; - Tensor beta1_tmp(experimental::DataType::FLOAT32); - Tensor beta2_tmp(experimental::DataType::FLOAT32); - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta1_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta2_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index f94b32413a04a..356bef435e45c 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class AdamNPUKernel : public framework::OpKernel { public: @@ -132,9 +130,9 @@ class AdamNPUKernel : public framework::OpKernel { const phi::DenseTensor* beta2_tensor = nullptr; const phi::DenseTensor* epsilon_tensor = nullptr; - Tensor beta1_tmp(experimental::DataType::FLOAT32); - Tensor beta2_tmp(experimental::DataType::FLOAT32); - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta1_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor beta2_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); if (ctx.HasInput("Beta1Tensor")) { beta1_tensor = ctx.Input("Beta1Tensor"); @@ -286,9 +284,9 @@ class AdamWNPUKernel : public AdamNPUKernel { ctx.template device_context() .stream(); - Tensor one(experimental::DataType::FLOAT32); - Tensor decay(experimental::DataType::FLOAT32); - Tensor tmp(experimental::DataType::FLOAT32); + phi::DenseTensor one(experimental::DataType::FLOAT32); + phi::DenseTensor decay(experimental::DataType::FLOAT32); + phi::DenseTensor tmp(experimental::DataType::FLOAT32); tmp.mutable_data({1}, place); one.mutable_data({1}, place); diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index 5298030f17a04..12429933e03d3 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 5ab3ef3b2e61c..6c73439c62551 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class DecayedAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index e866a97f1ddcc..f5710f2e7d8eb 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class DpsgdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index b81a6c5ab6bb7..22be1f5ac685a 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class FTRLOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index 97b1a09766b68..99e210ce51e96 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc index 8e4ff40372a12..867cfe0268c51 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cc +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MergedAdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index c390a12863bc4..ea74dba1c54d1 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -134,7 +134,8 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + phi::DenseTensor mu_tensor = + ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, @@ -158,7 +159,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { auto velocity_out = velocitys_out[idx]; auto grad = grads[idx]; - Tensor regularized_grad; + phi::DenseTensor regularized_grad; MLUCnnlTensorDesc param_desc(*param_out); if (regularization_flag == phi::RegularizationType::kL2DECAY) { regularized_grad = ctx.AllocateTmpTensor( diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index 4171f0c11955a..538028139b8c4 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class MomentumOpInferVarType : public framework::VarTypeInference { public: void operator()(framework::InferVarTypeContext* ctx) const override { @@ -38,24 +36,24 @@ class MomentumOpInferVarType : public framework::VarTypeInference { void MomentumOpMaker::Make() { AddInput("Param", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input parameter that has to be updated"); AddInput("Grad", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input gradient of the parameter"); AddInput("Velocity", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input velocity (corresponding to the parameter) " "that has to be updated"); AddInput("LearningRate", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input learning rate"); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", - "(Tensor) This output is updated parameter. 
" + "(phi::DenseTensor) This output is updated parameter. " "It shared memory with Input(Param)."); AddOutput("VelocityOut", - "(Tensor) This output is updated velocity. " + "(phi::DenseTensor) This output is updated velocity. " "It shared memory with Input(Velocity)."); AddOutput("MasterParamOut", "The updated FP32 master weight for AMP. " diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index 4bebb4264cc29..b37e7aa99f793 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -50,7 +50,7 @@ class MLUMomentumOpKernel : public framework::OpKernel { auto* grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { auto grad = ctx.Input("Grad"); - Tensor mu_tensor = + phi::DenseTensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); MLUCnnl::Fill(ctx, @@ -59,7 +59,7 @@ class MLUMomentumOpKernel : public framework::OpKernel { mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); - Tensor regularized_grad; + phi::DenseTensor regularized_grad; MLUCnnlTensorDesc param_desc(*param); if (regularization_flag == phi::RegularizationType::kL2DECAY) { regularized_grad = diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 2da5bed7642c1..598b84415f9ec 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ProximalAdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h index 136e416307ab0..72eccd17e4489 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ProximalAdagradOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 061e495c4bacd..21b145ee49d7c 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class ProximalGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h index 024062045ae43..49cf7b68bd32a 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.h +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ProximalGDOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc index 579bc76be5f47..abbe7ddcc5b61 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_npu.cc @@ -15,8 +15,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class RMSPROPNPUKernel : public framework::OpKernel { public: @@ -46,18 +44,18 @@ class RMSPROPNPUKernel : public framework::OpKernel { auto *grad_tensor = ctx.Input("Grad"); if (centered) { framework::NPUAttributeMap attr_input = {{"use_locking", false}}; - const Tensor *rho_tensor = nullptr; - const Tensor *momentum_tensor = nullptr; - const Tensor *epsilon_tensor = nullptr; - Tensor rho_tmp(experimental::DataType::FLOAT32); + const phi::DenseTensor *rho_tensor = nullptr; + const phi::DenseTensor *momentum_tensor = nullptr; + const phi::DenseTensor *epsilon_tensor = nullptr; + phi::DenseTensor rho_tmp(experimental::DataType::FLOAT32); rho_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&rho_tmp, rho); rho_tensor = &rho_tmp; - Tensor momentum_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor momentum_tmp(experimental::DataType::FLOAT32); momentum_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&momentum_tmp, momentum); momentum_tensor = &momentum_tmp; - Tensor epsilon_tmp(experimental::DataType::FLOAT32); + phi::DenseTensor epsilon_tmp(experimental::DataType::FLOAT32); epsilon_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&epsilon_tmp, epsilon); epsilon_tensor = &epsilon_tmp; diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index 3e072a5e17a64..f59171e3ae7c4 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -19,8 +19,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SparseMomentumOpInferVarType : public framework::VarTypeInference { public: void operator()(framework::InferVarTypeContext* ctx) const override { @@ -36,30 +34,31 @@ class SparseMomentumOpInferVarType : public framework::VarTypeInference { void SparseMomentumOpMaker::Make() { AddInput("Param", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input parameter that has to be updated"); AddInput("Grad", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input gradient of the parameter"); AddInput("Velocity", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input velocity (corresponding to the parameter) " "that has to be updated"); AddInput("Index", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input index of Param to do update operation"); AddInput("Axis", - "The Tensor which contains the axis that we do update operation.") + "The phi::DenseTensor which contains the axis that we do update " + "operation.") .AsDispensable(); AddInput("LearningRate", - "(Tensor, default Tensor) " + "(phi::DenseTensor, default phi::DenseTensor) " "Input learning rate"); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", - "(Tensor) This output is updated parameter. 
" + "(phi::DenseTensor) This output is updated parameter. " "It shared memory with Input(Param)."); AddOutput("VelocityOut", - "(Tensor) This output is updated velocity. " + "(phi::DenseTensor) This output is updated velocity. " "It shared memory with Input(Velocity)."); AddOutput("MasterParamOut", "The updated FP32 master weight for AMP. " diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc index 9d312dd572a45..5c86cf188c021 100644 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ b/paddle/fluid/operators/p_norm_op_npu.cc @@ -62,7 +62,7 @@ class PnormNPUKernel : public framework::OpKernel { {"keep_dims", keepdim}}); runner.Run(stream); } else { - Tensor tmp_x; + phi::DenseTensor tmp_x; tmp_x.mutable_data(xdim, ctx.GetPlace()); const auto& power_runner1 = @@ -93,7 +93,6 @@ template class PnormGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = phi::DenseTensor; auto* x = ctx.Input("X"); auto* y = ctx.Input("Out"); auto* dy = ctx.Input(framework::GradVarName("Out")); @@ -113,8 +112,8 @@ class PnormGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor y_share(y->type()); - Tensor dy_share(dy->type()); + phi::DenseTensor y_share(y->type()); + phi::DenseTensor dy_share(dy->type()); y_share.ShareDataWith(*y); dy_share.ShareDataWith(*dy); auto ydim = xdim; @@ -130,22 +129,22 @@ class PnormGradNPUKernel : public framework::OpKernel { FillNpuTensorWithConstant(dx, static_cast(0)); dx->Resize(xdim); } else if (porder == INFINITY || porder == -INFINITY) { - Tensor x_abs; + phi::DenseTensor x_abs; x_abs.mutable_data(xdim, place); const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); r_abs.Run(stream); - Tensor t_cond; + phi::DenseTensor t_cond; t_cond.mutable_data(xdim, place); const auto& r_equal = NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); r_equal.Run(stream); - Tensor t_zero; + phi::DenseTensor t_zero; t_zero.mutable_data({1}, place); FillNpuTensorWithConstant(&t_zero, static_cast(0)); - Tensor x_sign; + phi::DenseTensor x_sign; x_sign.mutable_data(xdim, place); const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); r_sign.Run(stream); @@ -157,17 +156,17 @@ class PnormGradNPUKernel : public framework::OpKernel { NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); r_sel.Run(stream); } else { - Tensor x_abs; + phi::DenseTensor x_abs; x_abs.mutable_data(xdim, place); const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); r_abs.Run(stream); - Tensor x_sign; + phi::DenseTensor x_sign; x_sign.mutable_data(xdim, place); const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); r_sign.Run(stream); - Tensor y_pow; + phi::DenseTensor y_pow; y_pow.mutable_data(ydim, place); if (porder >= 1) { const auto& r_pow1 = NpuOpRunner( diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 7694e0edbf9f9..497dc51e39f0d 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static inline std::vector GetPaddings( const framework::ExecutionContext& context) { std::vector paddings(6); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index 425defc9792c7..27443b8b425d7 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PadNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index e5a066fdc6539..01095b6d429b4 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class PartialConcatOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index ef52bbad525a4..f4acf68dcbc70 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -23,8 +23,6 @@ namespace operators { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) -using Tensor = phi::DenseTensor; - template __global__ void ConcatPartialCUDAKernel(T **in, T *out, @@ -72,7 +70,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_vars = ctx.MultiInput("X"); - Tensor *out = ctx.Output("Out"); + phi::DenseTensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ(in_vars[0] != nullptr, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index c0c6b2f607526..050752f23888b 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index aa2f30aaafc2c..6473f8d603789 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class PartialSumOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index c92e9618bfce0..093e0032b3cb9 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -23,8 +23,6 @@ namespace operators { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) -using Tensor = phi::DenseTensor; - template __global__ void SumArrayPartialCUDAKernel(T **in, T *out, @@ -77,7 +75,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto in_vars = ctx.MultiInput("X"); - Tensor *out = ctx.Output("Out"); + phi::DenseTensor *out = ctx.Output("Out"); PADDLE_ENFORCE_EQ( in_vars[0] != nullptr, @@ -150,7 +148,7 @@ template class PartialSumGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const Tensor *out_grad = + const phi::DenseTensor *out_grad = ctx.Input(framework::GradVarName("Out")); auto ins = ctx.MultiInput("X"); auto outs = ctx.MultiOutput(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index 26f5039e6363f..fa4cc19d5e2c3 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PartialSumKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 25d2ac8ce0d7a..c160dc28bfda4 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -109,12 +109,12 @@ framework::OpKernelType PoolOpGrad::GetKernelTypeForVar( void Pool2dOpMaker::Make() { AddInput( "X", - "(Tensor) The input tensor of pooling operator. " + "(phi::DenseTensor) The input tensor of pooling operator. " "The format of input tensor is NCHW, where N is batch size, C is the " "number of channels, H is the height of the feature, " "and W is the width of the feature."); AddOutput("Out", - "(Tensor) The output tensor of pooling operator. " + "(phi::DenseTensor) The output tensor of pooling operator. " "The format of output tensor is also NCHW, " "where N is batch size, C is the number of channels, " "H is the height of the feature, " @@ -301,14 +301,14 @@ class PoolOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { void Pool3dOpMaker::Make() { AddInput("X", - "(Tensor) The input tensor of pooling operator. " + "(phi::DenseTensor) The input tensor of pooling operator. " "The format of input tensor is NCDHW or NDHWC, where N is batch " "size, C is " "the number of channels, and D, H and W is the depth, height and " "width of " "the feature, respectively."); AddOutput("Out", - "(Tensor) The output tensor of pooling operator." + "(phi::DenseTensor) The output tensor of pooling operator." "The format of output tensor is also NCDHW or NDHWC, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index c08b589cbe12e..fd2c0ce15b461 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index e2af30faf36f4..6e422a645fffb 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -46,8 +46,8 @@ class MLUPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); + const phi::DenseTensor *in_x = ctx.Input("X"); + phi::DenseTensor *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); std::string pooling_type = ctx.Attr("pooling_type"); @@ -212,11 +212,11 @@ class MLUPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = + const phi::DenseTensor *in_x = ctx.Input("X"); + const phi::DenseTensor *out = ctx.Input("Out"); + const phi::DenseTensor *out_grad = ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = + phi::DenseTensor *in_x_grad = ctx.Output(framework::GradVarName("X")); in_x_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index 1cc89cda21bc7..745b793f51147 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class PositiveNegativePairKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 51aa3dcd39a35..8a2199e0231bf 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index 9b3146c3b8487..ca291187b9cdd 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index b24ded79dd050..d1aa1d37d0479 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -17,8 +17,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaximumNumBlocks = 4096; diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index 5eead81365053..a24b234a05da7 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -28,7 +28,6 @@ extern "C" { namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; class PyramidHashOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index f7f111299c73d..afe45894d5c1a 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -29,8 +29,6 @@ static inline int GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } -using Tensor = phi::DenseTensor; - template __global__ void random_routing_kernel(int64_t* data, const int64_t length, diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 4c740c5985ade..80bd022aff340 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class RankAttentionOp : public framework::OperatorWithKernel { public: diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc index ce06d1b1089a5..41de1f6b1300a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc @@ -21,7 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ReduceAnyNPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d623bf23534a0..aec1640181bcc 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -33,8 +33,6 @@ limitations under the License. 
*/ namespace f = paddle::framework; namespace p = paddle::platform; -using Tensor = phi::DenseTensor; - USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index a23931c0aa246..ca19b9e6e52da 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -96,9 +96,10 @@ template class ReduceMaxGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Input("Out"); - auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); auto reduce_dims = context.Attr>("dim"); bool reduce_all = context.Attr("reduce_all"); int in_dtype = context.Attr("in_dtype"); @@ -108,7 +109,8 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "MLU only support in_dtype == -1 in reduce_max_grad op.")); - auto* x_grad = context.Output(framework::GradVarName("X")); + auto* x_grad = + context.Output(framework::GradVarName("X")); x_grad->mutable_data(context.GetPlace()); auto place = context.GetPlace(); @@ -122,7 +124,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel { } } - Tensor tmp_out, tmp_out_grad; + phi::DenseTensor tmp_out, tmp_out_grad; auto tmp_out_dims_vec = x_dims_vec; for (auto d : reduce_dims) { if (d < 0) { @@ -136,7 +138,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel { tmp_out_grad.ShareDataWith(*out_grad); tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); - Tensor transformed_out(x->type()); + phi::DenseTensor transformed_out(x->type()); transformed_out.Resize(phi::make_ddim(x_dims_vec)); transformed_out.mutable_data(place); @@ -149,7 +151,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel { transformed_out_desc.get(), GetBasePtr(&transformed_out)); - Tensor transformed_out_grad(x->type()); + phi::DenseTensor transformed_out_grad(x->type()); transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); transformed_out_grad.mutable_data(place); MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad); @@ -162,7 +164,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel { GetBasePtr(&transformed_out_grad)); // compare - Tensor equal_cond; + phi::DenseTensor equal_cond; equal_cond.mutable_data(x_grad->dims(), place); MLUCnnlTensorDesc x_desc(*x); @@ -178,7 +180,7 @@ class ReduceMaxGradMLUKernel : public framework::OpKernel { GetBasePtr(&equal_cond)); // select - Tensor t_zero; + phi::DenseTensor t_zero; t_zero.mutable_data(x_grad->dims(), place); FillMLUTensorWithHostValue(context, static_cast(0), &t_zero); t_zero.Resize(x_grad->dims()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index 172786963e4c9..1ade0c6746918 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ReduceMaxNPUKernel : public framework::OpKernel { public: @@ -77,8 +76,8 @@ class ReduceMaxNPUKernel : public framework::OpKernel { ctx.template device_context(); if (framework::TransToProtoVarType(x->dtype()) == framework::proto::VarType::INT64) { - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = @@ -147,7 +146,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { } } - Tensor tmp_out, tmp_out_grad; + phi::DenseTensor tmp_out, tmp_out_grad; auto tmp_out_dims_vec = x_dims_vec; for (auto d : reduce_dims) { if (d < 0) { @@ -161,7 +160,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { tmp_out_grad.ShareDataWith(*out_grad); tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); - Tensor transformed_out(x->type()); + phi::DenseTensor transformed_out(x->type()); transformed_out.Resize(phi::make_ddim(x_dims_vec)); transformed_out.mutable_data(place); NpuOpRunner r_brd_out; @@ -170,7 +169,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out) .Run(stream); - Tensor transformed_out_grad(x->type()); + phi::DenseTensor transformed_out_grad(x->type()); transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); transformed_out_grad.mutable_data(place); NpuOpRunner r_brd_out_grad; @@ -181,14 +180,14 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { .Run(stream); // compare - Tensor equal_cond; + phi::DenseTensor equal_cond; equal_cond.mutable_data(x_grad->dims(), place); const auto& r_equal = NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {}); r_equal.Run(stream); // select - Tensor t_zero; + phi::DenseTensor t_zero; t_zero.mutable_data(x_grad->dims(), place); FillNpuTensorWithConstant(&t_zero, static_cast(0)); t_zero.Resize(x_grad->dims()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc index b73bde6275347..d1658c24733c9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -54,7 +54,7 @@ class ReduceMeanGradMLUKernel : public framework::OpKernel { reduce_numel *= input_dims[d]; } - Tensor tmp_output_grad(output_grad->dtype()); + phi::DenseTensor tmp_output_grad(output_grad->dtype()); auto tmp_output_dims = input_dims; for (auto d : reduce_dims) { tmp_output_dims[d] = 1; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc index feca58ce19861..35273df44d1e2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_npu.cc @@ -81,7 +81,7 @@ class NPUReduceMeanGradOpKernel : public framework::OpKernel { reduce_numel *= input_dims[d]; } - Tensor tensor_value(input_grad->dtype()); + phi::DenseTensor tensor_value(input_grad->dtype()); tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant( &tensor_value, static_cast(1.0f / static_cast(reduce_numel))); @@ -96,8 +96,8 @@ class NPUReduceMeanGradOpKernel : public framework::OpKernel { .AddOutput(*input_grad) .Run(stream); - Tensor transformed_input_grad, transformed_out_grad; - Tensor tmp_output_grad; + phi::DenseTensor transformed_input_grad, 
transformed_out_grad; + phi::DenseTensor tmp_output_grad; auto tmp_output_dims = input_dims; for (auto d : reduce_dims) { tmp_output_dims[d] = 1; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc index 19efb2e6bfb4c..e7401d7917763 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ReduceMinNPUKernel : public framework::OpKernel { public: @@ -76,8 +75,8 @@ class ReduceMinNPUKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); if (x->dtype() == experimental::DataType::INT64) { - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 027a787cbf50b..0cc7bf2898f86 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -48,7 +48,6 @@ namespace operators { keep_dim); \ } -using Tensor = phi::DenseTensor; using DDim = framework::DDim; inline void GetShuffledDim(const DDim& src_dims, @@ -137,7 +136,7 @@ void HandleLargeDim(const framework::ExecutionContext& context, const std::vector& dims, bool keep_dim) { // shuffle the reduced dim to the end - Tensor shuffled_input; + phi::DenseTensor shuffled_input; GetShuffledInput(context, input, &shuffled_input, dims); // transpose to 2D tensor whose shape is {unreduced, reduced}. 
@@ -168,7 +167,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, DDim out_dim(out->dims()); DDim x_dim(x->dims()); // transpose and reshape X - Tensor shuffled_x; + phi::DenseTensor shuffled_x; GetShuffledInput(context, x, &shuffled_x, dims); DDim shuffled_dim = shuffled_x.dims(); shuffled_x.Resize({unreduced, reduced}); @@ -185,7 +184,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, // transpose dX std::vector origin_axis(x_dim.size()); GetOriginDimFromShuffled(x_dim, dims, &origin_axis); - Tensor dx_tmp; + phi::DenseTensor dx_tmp; framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); dx_tmp.Resize(shuffled_dim); dx->Resize(x_dim); @@ -453,7 +452,7 @@ class ReduceGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { int in_dtype = context.Attr("in_dtype"); if (in_dtype >= 0) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto* pre_input = context.Input(framework::GradVarName("Out")); auto in_kernel_type = framework::OpKernelType( diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_function.h b/paddle/fluid/operators/reduce_ops/reduce_op_function.h index 39a0dc044f272..3176e489f89b3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_function.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_function.h @@ -21,7 +21,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template class ReduceProdNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 69c8935dafd6b..7b1b6bc831f0e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -80,7 +80,7 @@ class ReduceSumGradKernel : public framework::OpKernel { int in_dtype = context.Attr("out_dtype"); if (in_dtype >= 0) { - Tensor tmp_tensor; + phi::DenseTensor tmp_tensor; auto* pre_input = context.Input(framework::GradVarName("Out")); auto in_kernel_type = framework::OpKernelType( diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc index 4ecf6e907b4cb..130c617f873ba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc @@ -52,7 +52,7 @@ class ReduceSumGradMLUKernel : public framework::OpKernel { } } - Tensor tmp_out(out_grad->dtype()); + phi::DenseTensor tmp_out(out_grad->dtype()); auto tmp_output_dims = in_dims; for (auto d : reduce_dims) { tmp_output_dims[d] = 1; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index 6ba8a9c1373a1..9588aa54f3877 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -129,7 +129,7 @@ class ReduceSumGradNPUKernel : public framework::OpKernel { out_dims = UnsqueezeKernel::GetOutputShape( dims, out_grad->dims()); - Tensor out_grad_tmp(out_grad->type()); + phi::DenseTensor out_grad_tmp(out_grad->type()); out_grad_tmp.Resize(out_dims); out_grad_tmp.mutable_data(ctx.GetPlace()); framework::TensorCopy( diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 161f230bacbe4..42e6929508bff 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -41,8 +41,6 @@ class OpBase; namespace paddle { 
namespace operators { -using Tensor = phi::DenseTensor; - class ReshapeOp : public framework::OperatorWithKernel { public: ReshapeOp(const std::string &type, @@ -272,7 +270,7 @@ class ReshapeOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor") { return expected_kernel_type; @@ -638,7 +636,7 @@ class Reshape2GradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor") { return expected_kernel_type; @@ -666,7 +664,7 @@ class Reshape2DoubleGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/rnn_op_mlu.cc b/paddle/fluid/operators/rnn_op_mlu.cc index cf4e255668232..1773c526b4635 100644 --- a/paddle/fluid/operators/rnn_op_mlu.cc +++ b/paddle/fluid/operators/rnn_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; using TensorList = std::vector; template @@ -459,7 +458,7 @@ class RNNMLUGradKernel : public framework::OpKernel { input_grad->mutable_data(input->dims(), ctx.GetPlace()); FillMLUTensorWithHostValue(ctx, static_cast(0.0), input_grad); - Tensor a, b; + phi::DenseTensor a, b; phi::DenseTensor* dynamic_grad_pre_h = &a; phi::DenseTensor* dynamic_grad_pre_c = &b; if (init_h_grad) { diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 6a7999c56557f..4407fbf1a8c96 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -20,8 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/roi_align_op_mlu.cc b/paddle/fluid/operators/roi_align_op_mlu.cc index 5bde4dd7b6686..de0a8be93452d 100644 --- a/paddle/fluid/operators/roi_align_op_mlu.cc +++ b/paddle/fluid/operators/roi_align_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ROIAlignOpMLUKernel : public framework::OpKernel { public: @@ -76,7 +74,7 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(lod.empty(), false, platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " + "Input(ROIs) phi::DenseTensor of ROIAlignOp " "does not contain LoD information.")); auto rois_lod = lod.back(); rois_batch_size = rois_lod.size() - 1; @@ -110,7 +108,7 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { } // only support float32 for now - Tensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); rois_cpu.Resize({rois_num, 4}); rois_cpu.mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); @@ -119,8 +117,8 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { T* rois_cpu_ptr = rois_cpu.mutable_data(platform::CPUPlace()); // boxes; [batch_idx, x1, y1, x2, y2] - Tensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); - Tensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); boxes_cpu.Resize({rois_num, 5}); boxes_mlu.Resize({rois_num, 5}); T* boxes_cpu_ptr = boxes_cpu.mutable_data(platform::CPUPlace()); @@ -139,8 +137,8 @@ class ROIAlignOpMLUKernel : public framework::OpKernel { const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; - Tensor input_nhwc(in->type()); - Tensor output_nhwc(out->type()); + phi::DenseTensor input_nhwc(in->type()); + phi::DenseTensor output_nhwc(out->type()); TransposeFromMLUTensor( ctx, perm_to_nhwc, in, &input_nhwc, true /*need_reshape_or_alloc*/); auto output_dims = out->dims(); @@ -221,7 +219,7 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { } } - Tensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32)); rois_cpu.Resize({rois_num, 4}); rois_cpu.mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); @@ -230,8 +228,8 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { T* rois_cpu_ptr = rois_cpu.mutable_data(platform::CPUPlace()); // boxes; [batch_idx, x1, y1, x2, y2] - Tensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); - Tensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32)); + phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32)); boxes_cpu.Resize({rois_num, 5}); boxes_mlu.Resize({rois_num, 5}); T* boxes_cpu_ptr = boxes_cpu.mutable_data(platform::CPUPlace()); @@ -250,8 +248,8 @@ class ROIAlignGradOpMLUKernel : public framework::OpKernel { const std::vector perm_to_nhwc = {0, 2, 3, 1}; const std::vector perm_to_nchw = {0, 3, 1, 2}; - Tensor grads_nhwc(out_grad->type()); - Tensor grads_image_nhwc(in_grad->type()); + phi::DenseTensor grads_nhwc(out_grad->type()); + phi::DenseTensor grads_image_nhwc(in_grad->type()); TransposeFromMLUTensor(ctx, perm_to_nhwc, out_grad, diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index 72578ca0177c0..06be3f35b3f23 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ROIAlignNPUKernel : public framework::OpKernel { @@ -54,7 +53,7 @@ class ROIAlignNPUKernel : public framework::OpKernel { int dtype = static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}}; - Tensor ROIsNum_fp(ROIs->dtype()); + phi::DenseTensor ROIsNum_fp(ROIs->dtype()); ROIsNum_fp.Resize(phi::make_ddim({ROIs->dims()[0], 1})); ROIsNum_fp.mutable_data(ctx.GetPlace()); @@ -68,7 +67,7 @@ class ROIAlignNPUKernel : public framework::OpKernel { x_list.push_back(*ROIs); auto axis = 1; // output of concate - Tensor ROIs_N5(ROIs->dtype()); + phi::DenseTensor ROIs_N5(ROIs->dtype()); ROIs_N5.Resize(phi::make_ddim({ROIs->dims()[0], 5})); ROIs_N5.mutable_data(ctx.GetPlace()); @@ -137,9 +136,9 @@ class ROIAlignNPUGradKernel : public framework::OpKernel { // Cast RoisNum to fp32 tensor auto* RoisNum = ctx.Input("RoisNum"); - Tensor ROIs_N5; + phi::DenseTensor ROIs_N5; ROIs_N5.mutable_data({rois_num, 5}, place); - Tensor ROIsNum_fp; + phi::DenseTensor ROIsNum_fp; ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] int nputype_fp32 = static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); @@ -161,7 +160,7 @@ class ROIAlignNPUGradKernel : public framework::OpKernel { // function #if (CANN_VERSION_CODE < 504000) std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; - Tensor tsr_dlt; + phi::DenseTensor tsr_dlt; tsr_dlt.mutable_data({5}, place); framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); ctx.template device_context().Wait(); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index b2e8a6ae58883..e79975e6254eb 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -23,8 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 8871627b85242..7d61088dd9fd6 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -112,7 +112,6 @@ __global__ void gpu_compute_remove_accidental_hits(const int size, template class SampleLogitsCUDAKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { // get necessary inputs const phi::DenseTensor* logits = context.Input("Logits"); @@ -165,16 +164,17 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { context.Input("CustomizedSamples"); const phi::DenseTensor* customized_probabilities = context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ(customized_samples, - samples, - platform::errors::InvalidArgument( - "CustomizedSamples must be the same Tensor with " - "Samples when use_customized_samples = True")); + PADDLE_ENFORCE_EQ( + customized_samples, + samples, + platform::errors::InvalidArgument( + "CustomizedSamples must be the same phi::DenseTensor with " + "Samples when use_customized_samples = True")); PADDLE_ENFORCE_EQ( customized_probabilities, probabilities, platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same Tensor with " + "CustomizedProbabilities must be the same phi::DenseTensor with " "Probabilities when use_customized_samples = True")); } else { samples->mutable_data(context.GetPlace()); @@ -238,7 +238,6 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { template class SampleLogitsGradCUDAKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { auto logits_grad = context.Output(framework::GradVarName("Logits")); diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index 584d115d28ff3..fe53a12e5ed71 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -27,7 +27,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -212,7 +211,6 @@ static void compute_remove_accidental_hits(const platform::DeviceContext& ctx, template class SampleLogitsKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_cpu_place(context.GetPlace()), @@ -264,16 +262,17 @@ class SampleLogitsKernel : public framework::OpKernel { context.Input("CustomizedSamples"); const phi::DenseTensor* customized_probabilities = context.Input("CustomizedProbabilities"); - PADDLE_ENFORCE_EQ(customized_samples, - samples, - platform::errors::InvalidArgument( - "CustomizedSamples must be the same Tensor with " - "Samples when use_customized_samples = True")); + PADDLE_ENFORCE_EQ( + customized_samples, + samples, + platform::errors::InvalidArgument( + "CustomizedSamples must be the same phi::DenseTensor with " + "Samples when use_customized_samples = True")); PADDLE_ENFORCE_EQ( customized_probabilities, probabilities, platform::errors::InvalidArgument( - "CustomizedProbabilities must be the same Tensor with " + "CustomizedProbabilities must be the same phi::DenseTensor with " "Probabilities when use_customized_samples = True")); } else { samples->mutable_data(context.GetPlace()); @@ -308,7 +307,6 @@ class SampleLogitsKernel : public framework::OpKernel { template class SampleLogitsGradKernel : public framework::OpKernel { public: - using Tensor = phi::DenseTensor; void Compute(const framework::ExecutionContext& context) const override { auto logits_grad = context.Output(framework::GradVarName("Logits")); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 6d2d3f4a60047..7e84077fd60ae 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -17,8 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SamplingIdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/sampling_id_op.h b/paddle/fluid/operators/sampling_id_op.h index 43c0bdcf4043e..e5c4f744db4a5 100644 --- a/paddle/fluid/operators/sampling_id_op.h +++ b/paddle/fluid/operators/sampling_id_op.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SamplingIdKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 41780561144b1..71f78911456f7 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SaveCombineOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/scatter_op_mlu.cc b/paddle/fluid/operators/scatter_op_mlu.cc index a4cb5d7424936..83cbbbd7b9e69 100644 --- a/paddle/fluid/operators/scatter_op_mlu.cc +++ b/paddle/fluid/operators/scatter_op_mlu.cc @@ -42,7 +42,7 @@ class ScatterMLUKernel : public framework::OpKernel { GetBasePtr(indices), mode); } else { - Tensor tensor_zeros(updates->type()); + phi::DenseTensor tensor_zeros(updates->type()); tensor_zeros.mutable_data(updates->dims(), ctx.GetPlace()); MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros); float value = 0.0; diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 6bffd24734055..ded722c7eb794 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ScatterNPUKernel : public framework::OpKernel { public: @@ -49,16 +47,16 @@ class ScatterNPUKernel : public framework::OpKernel { const auto& dev_ctx = ctx.template device_context(); - auto op_func_update = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func_update = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs); runner.Run(dev_ctx.stream()); }; - auto op_func_add = [](const std::vector& inputs, - const std::vector& outputs, + auto op_func_add = [](const std::vector& inputs, + const std::vector& outputs, const NPUAttributeMap& attrs, const platform::NPUDeviceContext& dev_ctx) { const auto& runner = diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index 34728c86c56b6..15f87803f5ab8 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -28,7 +28,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; template diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 88a1884ae53e4..93d57aedd8aff 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -17,7 +17,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; class SeedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index a1c3484b7a728..c3cbc16fb4884 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -19,7 +19,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; static int get_seed(const framework::ExecutionContext& context) { int user_seed = context.Attr("seed"); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 86049bec1eb21..a41b0f5f2b996 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -36,8 +36,6 @@ class OpBase; namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, @@ -55,7 +53,7 @@ class SetValue : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensorList" || var_name == "EndsTensorList" || var_name == "StepsTensorList") { @@ -70,24 +68,28 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { // Input - AddInput("Input", "(Tensor) Input tensor of set_value operator."); - AddInput("ValueTensor", "(Tensor) Value tensor of set_value operator.") + AddInput("Input", "(phi::DenseTensor) Input tensor of set_value operator."); + AddInput("ValueTensor", + "(phi::DenseTensor) Value tensor of set_value operator.") .AsDispensable(); AddInput("StartsTensorList", - "(vector>, optional) If provided, set_value will " + "(vector>, optional) If provided, " + "set_value will " "use this. The shape of the tensor in vector must be [1]." "It has higher priority compare with attr(starts).") .AsDuplicable() .AsDispensable(); AddInput("EndsTensorList", - "(vector>, optional) If provided, set_value will " + "(vector>, optional) If provided, " + "set_value will " "use this. The shape of the tensor in vector must BE [1]." "It has higher priority compare with attr(ends).") .AsDuplicable() .AsDispensable(); AddInput("StepsTensorList", - "(vector>, optional) If provided, set_value will " + "(vector>, optional) If provided, " + "set_value will " "use this. The shape of the tensor in vector must BE [1]." "It has higher priority compare with attr(steps).") .AsDuplicable() @@ -95,8 +97,9 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { // Output AddOutput("Out", - "(Tensor) Output tensor of set_value operator. The output is the " - "same Tensor as input"); + "(phi::DenseTensor) Output tensor of set_value operator. 
The " + "output is the " + "same phi::DenseTensor as input"); // Attr AddAttr("dtype", "data type of input.") @@ -142,7 +145,7 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("shape", "(vector) Shape of values.") .SetDefault({}); AddComment(R"DOC(SetValue operator. -Assignment to a Tensor in static mode. +Assignment to a phi::DenseTensor in static mode. )DOC"); } }; @@ -220,7 +223,7 @@ class SetValueGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensorList" || var_name == "EndsTensorList" || var_name == "StepsTensorList") { diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 7ef766020251b..d4ed1ce586e8f 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -31,7 +31,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; inline std::string GetValueName(framework::proto::VarType::Type data_type) { diff --git a/paddle/fluid/operators/set_value_op_mlu.cc b/paddle/fluid/operators/set_value_op_mlu.cc index 1b950a6da6084..06369b83bbab9 100644 --- a/paddle/fluid/operators/set_value_op_mlu.cc +++ b/paddle/fluid/operators/set_value_op_mlu.cc @@ -102,7 +102,7 @@ class SetValueMLUKernel : public framework::OpKernel { ends_indices[axis_index] = static_cast(ends[i]); strides_indices[axis_index] = static_cast(steps[i]); } - Tensor value_t(in->type()); + phi::DenseTensor value_t(in->type()); if (value_tensor != nullptr) { value_t.ShareDataWith(*value_tensor); } else { @@ -116,7 +116,7 @@ class SetValueMLUKernel : public framework::OpKernel { value_t.Resize(value_dims); } - Tensor value_temp(in->type()); + phi::DenseTensor value_temp(in->type()); if (slice_dims_for_assign == value_t.dims()) { value_temp.ShareDataWith(value_t); } else { @@ -133,7 +133,7 @@ class SetValueMLUKernel : public framework::OpKernel { int64_t input_numel = phi::product(in_dims); int64_t value_numel = phi::product(value_temp.dims()); - Tensor in_temp, out_temp, val_temp, index_out; + phi::DenseTensor in_temp, out_temp, val_temp, index_out; int64_t stride_step = phi::product(in_dims); std::vector index_indices(stride_step); std::iota(index_indices.begin(), index_indices.end(), 0); @@ -185,7 +185,7 @@ class SetValueMLUKernel : public framework::OpKernel { phi::product(slice_dims_for_assign), platform::errors::InvalidArgument( "OP(set_value) error index indices and value update not match ")); - Tensor index_final; + phi::DenseTensor index_final; index_final.ShareDataWith(index_out); int64_t indices_numel = phi::product(index_dims); auto new_index_dims = phi::make_ddim({indices_numel}); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 7526b13311b05..9dde6c6fbb3c0 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -132,7 +132,7 @@ class SetValueNPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "OP(set_value) error index indices and value update not match ")); - Tensor value_t(in->type()); + phi::DenseTensor value_t(in->type()); if (value_tensor != nullptr) { value_t.ShareDataWith(*value_tensor); } else { @@ -148,7 +148,7 @@ class SetValueNPUKernel : public framework::OpKernel { auto stream = ctx.template 
device_context().stream(); - Tensor value_temp(in->type()); + phi::DenseTensor value_temp(in->type()); if (slice_dims_for_assign == value_t.dims()) { value_temp.ShareDataWith(value_t); } else { @@ -165,7 +165,7 @@ class SetValueNPUKernel : public framework::OpKernel { int64_t input_numel = phi::product(in_dims); int64_t index_numel = index_indices.size(); - Tensor in_temp, out_temp, val_temp; + phi::DenseTensor in_temp, out_temp, val_temp; in_temp.ShareDataWith(*in); out_temp.ShareDataWith(*out); val_temp.ShareDataWith(value_temp); diff --git a/paddle/fluid/operators/shape_op_mlu.cc b/paddle/fluid/operators/shape_op_mlu.cc index bd51b49851840..f69a202819935 100644 --- a/paddle/fluid/operators/shape_op_mlu.cc +++ b/paddle/fluid/operators/shape_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template @@ -39,7 +38,7 @@ class ShapeMLUKernel : public framework::OpKernel { out_t->mutable_data(ctx.GetPlace()); // shape op cpu - Tensor shape_on_cpu( + phi::DenseTensor shape_on_cpu( framework::TransToPhiDataType(framework::proto::VarType::INT32)); shape_on_cpu.Resize({in_dims.size()}); auto cpu_data = shape_on_cpu.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 60a0162818c9d..f66ae5dc750fe 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class ShapeNPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index 3cc025ca9ed64..488615f66325e 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class ShardIndexNPUKernel : public framework::OpKernel { public: @@ -67,17 +66,17 @@ class ShardIndexNPUKernel : public framework::OpKernel { out->set_lod(in->lod()); out->mutable_data(place); - Tensor tmp(in->type()); + phi::DenseTensor tmp(in->type()); tmp.mutable_data(framework::DDim({1}), place); FillNpuTensorWithConstant(&tmp, shard_size); - Tensor condition(experimental::DataType::BOOL); + phi::DenseTensor condition(experimental::DataType::BOOL); condition.mutable_data(in->dims(), place); - Tensor tmp2(in->type()); + phi::DenseTensor tmp2(in->type()); tmp2.mutable_data(in->dims(), place); - Tensor tmp3(in->type()); + phi::DenseTensor tmp3(in->type()); tmp3.mutable_data(in->dims(), place); auto stream = @@ -103,7 +102,7 @@ class ShardIndexNPUKernel : public framework::OpKernel { runner2.SetType("Equal"); runner2.Run(stream); - Tensor tmp4(in->type()); + phi::DenseTensor tmp4(in->type()); tmp4.mutable_data(in->dims(), place); FillNpuTensorWithConstant(&tmp4, ignore_value); tmp4.Resize(in->dims()); diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h index 2f1fbee16e3d9..4bc1289bf468c 100644 --- a/paddle/fluid/operators/shuffle_batch_op.h +++ b/paddle/fluid/operators/shuffle_batch_op.h @@ -32,7 +32,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template using Vector = framework::Vector; diff --git a/paddle/fluid/operators/shuffle_channel_op.cu 
b/paddle/fluid/operators/shuffle_channel_op.cu index 4869a4c6c5e22..6aa59becb1d6f 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; static constexpr int kNumCUDAThreads = 512; static constexpr int kNumMaximumNumBlocks = 4096; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc index d77724281327c..431a36d414c99 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_mlu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index ea3f119a05a91..df4270b6f23bc 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/similarity_focus_op.h b/paddle/fluid/operators/similarity_focus_op.h index 8c055c2323c84..e706da9e01419 100644 --- a/paddle/fluid/operators/similarity_focus_op.h +++ b/paddle/fluid/operators/similarity_focus_op.h @@ -24,7 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class SimilarityFocusKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index d6f48d334759d..a418719907872 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -181,7 +179,7 @@ class SliceOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor") { return expected_kernel_type; @@ -349,7 +347,7 @@ class SliceOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc index 1935e2d0c9b14..771fca6a5ef18 100644 --- a/paddle/fluid/operators/slice_op_mlu.cc +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SliceMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 13ad263575698..59d6e2c2e42c1 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, @@ -199,7 +198,7 @@ class SliceGradNPUKernel : public framework::OpKernel { paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); } - Tensor tmp_dout; + phi::DenseTensor tmp_dout; tmp_dout.ShareDataWith(*dout); auto out_dims = dout->dims(); auto decrease_axis = ctx.Attr>("decrease_axis"); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h index 3cc565ef91203..e11f629d86dda 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.h +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template @@ -79,7 +78,7 @@ class SmoothL1LossKernel : public framework::OpKernel { } auto in_counts = in0->numel(); - Tensor ptensor_errors; + phi::DenseTensor ptensor_errors; ptensor_errors.mutable_data({static_cast(in_counts)}, context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); @@ -138,7 +137,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { auto mat_dims = phi::make_ddim({static_cast(in_dims[0]), static_cast(cols)}); - Tensor ptensor_diff; + phi::DenseTensor ptensor_diff; ptensor_diff.mutable_data({static_cast(counts)}, context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); @@ -147,7 +146,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { SmoothL1LossBackward(sigma2)); // compute weights - Tensor ptensor_weights; + phi::DenseTensor ptensor_weights; ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc index 1a4fb14bbb0b6..811a016c6515c 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc @@ -42,12 +42,12 @@ class SmoothL1LossNPUKernel : public framework::OpKernel { const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); runner1.Run(stream); - Tensor no_reduce_loss(in_x->dtype()); + phi::DenseTensor no_reduce_loss(in_x->dtype()); no_reduce_loss.Resize(in_x->dims()); no_reduce_loss.mutable_data(context.GetPlace()); // multiply inside weight before get the loss if (has_weight) { - Tensor tmp_diff(out_diff->dtype()); + phi::DenseTensor tmp_diff(out_diff->dtype()); tmp_diff.Resize(out_diff->dims()); tmp_diff.mutable_data(context.GetPlace()); const auto& runner2 = @@ -59,11 +59,11 @@ class SmoothL1LossNPUKernel : public framework::OpKernel { context.template device_context(), out_diff); - Tensor tmp_x(in_x->dtype()); + phi::DenseTensor tmp_x(in_x->dtype()); tmp_x.Resize(in_x->dims()); tmp_x.mutable_data(context.GetPlace()); - Tensor tmp_y(in_y->dtype()); + phi::DenseTensor tmp_y(in_y->dtype()); tmp_y.Resize(in_y->dims()); tmp_y.mutable_data(context.GetPlace()); @@ -90,7 +90,7 @@ class SmoothL1LossNPUKernel : public 
framework::OpKernel { // multiply outside weight and loss // reduceSum because the output'shape must be [B,1] if (has_weight) { - Tensor tmp_loss(no_reduce_loss.dtype()); + phi::DenseTensor tmp_loss(no_reduce_loss.dtype()); tmp_loss.Resize(no_reduce_loss.dims()); tmp_loss.mutable_data(context.GetPlace()); const auto& runner4 = @@ -134,13 +134,13 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { .stream(); // diff == in_x - in_y == diff - 0 - Tensor tmp_zero(diff->dtype()); + phi::DenseTensor tmp_zero(diff->dtype()); tmp_zero.Resize(diff->dims()); tmp_zero.mutable_data(context.GetPlace()); const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); runner_zero.Run(stream); - Tensor grad(diff->dtype()); + phi::DenseTensor grad(diff->dtype()); grad.Resize(diff->dims()); grad.mutable_data(context.GetPlace()); // broadcast og(output_grad) to adapt to the npu interface @@ -151,7 +151,7 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { {{"shape", phi::vectorize(diff->dims())}}); runner_broad.Run(stream); - Tensor gradient(diff->dtype()); + phi::DenseTensor gradient(diff->dtype()); gradient.Resize(diff->dims()); gradient.mutable_data(context.GetPlace()); // diff == diff - 0 == in_x - in_y @@ -163,14 +163,14 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { // mul weight and gradient if (has_weight) { - Tensor weight(inside_weight->dtype()); + phi::DenseTensor weight(inside_weight->dtype()); weight.Resize(inside_weight->dims()); weight.mutable_data(context.GetPlace()); const auto& runner_weight = NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); runner_weight.Run(stream); - Tensor tmp_grad(gradient.dtype()); + phi::DenseTensor tmp_grad(gradient.dtype()); tmp_grad.Resize(gradient.dims()); tmp_grad.mutable_data(context.GetPlace()); const auto& runner_weight_grad = @@ -196,7 +196,7 @@ class SmoothL1LossGradNPUKernel : public framework::OpKernel { // outy_grad = - gradient if (outy_grad) { outy_grad->mutable_data(context.GetPlace()); - Tensor coeff(experimental::DataType::FLOAT32); + phi::DenseTensor coeff(experimental::DataType::FLOAT32); coeff.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&coeff, -1); const auto& runner_y_grad = diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 91333b3393000..87d788b478367 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index d42f993f46219..6a51198e75460 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { public: @@ -61,7 +59,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { backprop->mutable_data(ctx.GetPlace()); softmax->mutable_data(ctx.GetPlace()); - Tensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; + phi::DenseTensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; logits_2d.ShareDataWith(*logits).Resize({n, d}); labels_1d.ShareDataWith(*labels).Resize({n}); loss_1d.ShareDataWith(*loss).Resize({n}); @@ -110,7 +108,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); - Tensor logits_grad_2d, loss_grad_1d, backprop_2d; + phi::DenseTensor logits_grad_2d, loss_grad_1d, backprop_2d; logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index 6cc8d0f79be4e..0d4af9c0ce94a 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -23,8 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class SpaceToDepthOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index b03a0b6c84e71..c0ad3c9a57823 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -203,7 +203,6 @@ __global__ void BlockSparseSoftmaxBackward(T* dst, } } -using Tensor = phi::DenseTensor; /* input: sparse C in CSR format (num_rows,num_rows) output: sparse C after softmax operation @@ -641,7 +640,7 @@ void DotDsd(const phi::GPUContext& ctx, platform::dynload::cusparseDestroy(handle); } -std::vector GetSplitTensor(phi::DenseTensor* input) { +std::vector GetSplitTensor(phi::DenseTensor* input) { auto dims = input->dims(); int batch_size = dims[0]; int num_heads = dims[1]; @@ -687,14 +686,16 @@ class SparseAttentionCUDAKernel : public framework::OpKernel { int M = query_dims[2]; int N = query_dims[3]; - std::vector query_lists = GetSplitTensor(&query); - std::vector key_lists = GetSplitTensor(&key); - std::vector value_lists = GetSplitTensor(&value); - std::vector offset_lists = GetSplitTensor(&offset); - std::vector columns_lists = GetSplitTensor(&columns); - std::vector result_sdd_lists = GetSplitTensor(&result_sdd); - std::vector result_softmax_lists = GetSplitTensor(&result_softmax); - std::vector output_lists = GetSplitTensor(&output); + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector result_sdd_lists = + GetSplitTensor(&result_sdd); + std::vector result_softmax_lists = + GetSplitTensor(&result_softmax); + std::vector output_lists = GetSplitTensor(&output); const auto& dev_ctx = ctx.cuda_device_context(); const int iter_num = batch_size * num_heads; @@ -802,17 +803,18 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { int M = query_dims[2]; int 
N = query_dims[3]; - std::vector query_lists = GetSplitTensor(&query); - std::vector key_lists = GetSplitTensor(&key); - std::vector value_lists = GetSplitTensor(&value); - std::vector offset_lists = GetSplitTensor(&offset); - std::vector columns_lists = GetSplitTensor(&columns); - std::vector sparse_dot_sdd_lists = GetSplitTensor(&sparse_dot_sdd); - std::vector softmax_lists = GetSplitTensor(&softmax); - std::vector dout_lists = GetSplitTensor(&dout); - std::vector dquery_lists = GetSplitTensor(&dquery); - std::vector dkey_lists = GetSplitTensor(&dkey); - std::vector dvalue_lists = GetSplitTensor(&dvalue); + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector sparse_dot_sdd_lists = + GetSplitTensor(&sparse_dot_sdd); + std::vector softmax_lists = GetSplitTensor(&softmax); + std::vector dout_lists = GetSplitTensor(&dout); + std::vector dquery_lists = GetSplitTensor(&dquery); + std::vector dkey_lists = GetSplitTensor(&dkey); + std::vector dvalue_lists = GetSplitTensor(&dvalue); const int iter_num = batch_size * num_heads; const auto& dev_ctx = ctx.cuda_device_context(); @@ -831,7 +833,7 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { // dSoftmax = dOut * transpose(Value) int nnz_num = columns_lists[i].numel(); - Tensor dsoftmax; + phi::DenseTensor dsoftmax; dsoftmax.Resize({nnz_num}); dsoftmax.mutable_data(ctx.GetPlace()); DotSdd(dev_ctx, @@ -846,7 +848,7 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { true); // dSparseDotSdd = dSoftmax * softmax'(SparseDotSdd) - Tensor dsparse_dot_sdd; + phi::DenseTensor dsparse_dot_sdd; dsparse_dot_sdd.Resize({nnz_num}); dsparse_dot_sdd.mutable_data(ctx.GetPlace()); SparseSoftmaxBackward(dev_ctx, diff --git a/paddle/fluid/operators/split_op_mlu.cc b/paddle/fluid/operators/split_op_mlu.cc index cda18720e7aba..77928c7efc8da 100644 --- a/paddle/fluid/operators/split_op_mlu.cc +++ b/paddle/fluid/operators/split_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SplitMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc index 2fa8fa2a805eb..966f2ea6849b9 100644 --- a/paddle/fluid/operators/split_op_npu.cc +++ b/paddle/fluid/operators/split_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SplitNPUKernel : public framework::OpKernel { public: @@ -44,7 +42,7 @@ class SplitNPUKernel : public framework::OpKernel { "The SectionsTensorList is not supported on NPU now.")); } - std::vector outputs; + std::vector outputs; for (size_t j = 0; j < outs.size(); ++j) { outs[j]->mutable_data(ctx.GetPlace()); outputs.push_back(*outs[j]); diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h index 1698c65fc47ac..f0838c4fad2de 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SquaredL2DistanceKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/squared_l2_norm_op_mlu.cc b/paddle/fluid/operators/squared_l2_norm_op_mlu.cc index fcd83b40875ec..0c558502ddf65 100644 --- a/paddle/fluid/operators/squared_l2_norm_op_mlu.cc +++ b/paddle/fluid/operators/squared_l2_norm_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SquaredL2NormMLUKernel : public framework::OpKernel { public: @@ -82,7 +80,7 @@ class SquaredL2NormGradMLUKernel : public framework::OpKernel { auto place = context.GetPlace(); // broadcast out_grad - Tensor broadcasted_out_grad; + phi::DenseTensor broadcasted_out_grad; broadcasted_out_grad.mutable_data(x_grad->dims(), place); MLUCnnlTensorDesc broadcasted_out_grad_desc(broadcasted_out_grad); MLUCnnlTensorDesc out_grad_desc(*out_grad); @@ -93,7 +91,7 @@ class SquaredL2NormGradMLUKernel : public framework::OpKernel { GetBasePtr(&broadcasted_out_grad)); // mul x - Tensor tmp_x_grad; + phi::DenseTensor tmp_x_grad; tmp_x_grad.mutable_data(x_grad->dims(), place); MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc tmp_x_grad_desc(tmp_x_grad); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc index 25260ed4c1286..0cebf8e59d6a6 100644 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ b/paddle/fluid/operators/squared_l2_norm_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class SquaredL2NormNPUKernel : public framework::OpKernel { public: @@ -65,7 +63,7 @@ class SquaredL2NormGradNPUKernel : public framework::OpKernel { .stream(); // broadcast out_grad - Tensor broadcasted_out_grad; + phi::DenseTensor broadcasted_out_grad; broadcasted_out_grad.mutable_data(x_grad->dims(), place); const auto &broadcast_runner = NpuOpRunner("BroadcastToD", @@ -74,13 +72,13 @@ class SquaredL2NormGradNPUKernel : public framework::OpKernel { {{"shape", phi::vectorize(x_grad->dims())}}); broadcast_runner.Run(stream); // mul x - Tensor tmp_x_grad; + phi::DenseTensor tmp_x_grad; tmp_x_grad.mutable_data(x_grad->dims(), place); const auto &mul_x_runner = NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); mul_x_runner.Run(stream); // mul coefficient:2 - Tensor coefficient; + phi::DenseTensor coefficient; coefficient.mutable_data({1}, place); FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); x_grad->mutable_data(place); diff --git a/paddle/fluid/operators/stack_op_mlu.cc b/paddle/fluid/operators/stack_op_mlu.cc index eeac200676f4a..16076a180a54e 100644 --- a/paddle/fluid/operators/stack_op_mlu.cc +++ b/paddle/fluid/operators/stack_op_mlu.cc @@ -19,8 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class StackMLUKernel : public framework::OpKernel { public: @@ -31,10 +29,10 @@ class StackMLUKernel : public framework::OpKernel { if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); - PADDLE_ENFORCE_GT( - num, - 0, - platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, + 0, + platform::errors::InvalidArgument( + "number of input phi::DenseTensor <= 0")); std::vector x_descs; std::vector x_raw_descs; diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 3b5c0b1dc0cb6..7919294f60c33 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class StackNPUKernel : public framework::OpKernel { public: @@ -30,10 +28,10 @@ class StackNPUKernel : public framework::OpKernel { if (axis < 0) axis += (x[0]->dims().size() + 1); int num = static_cast(x.size()); - PADDLE_ENFORCE_GT( - num, - 0, - platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, + 0, + platform::errors::InvalidArgument( + "number of input phi::DenseTensor <= 0")); auto stream = ctx.template device_context() @@ -61,10 +59,10 @@ class StackGradNPUKernel : public framework::OpKernel { if (axis < 0) axis += dy->dims().size(); int num = dy->dims()[axis]; - PADDLE_ENFORCE_GT( - num, - 0, - platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, + 0, + platform::errors::InvalidArgument( + "number of input phi::DenseTensor <= 0")); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/stft_op.h b/paddle/fluid/operators/stft_op.h index 23130f687e305..b0d6091ecd37b 100644 --- a/paddle/fluid/operators/stft_op.h +++ b/paddle/fluid/operators/stft_op.h @@ -27,8 +27,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class StftKernel : public framework::OpKernel { public: @@ -59,7 +57,7 @@ class StftKernel : public framework::OpKernel { std::vector axes = {1}; // Frame - Tensor frames; + phi::DenseTensor frames; framework::DDim frames_dims(out->dims()); frames_dims.at(axes.back()) = n_fft; frames.mutable_data(frames_dims, ctx.GetPlace()); @@ -73,7 +71,7 @@ class StftKernel : public framework::OpKernel { /*is_grad*/ false); // Window - Tensor frames_w; + phi::DenseTensor frames_w; frames_w.mutable_data(frames_dims, ctx.GetPlace()); ElementwiseComputeEx, DeviceContext, T>( ctx, &frames, window, axes.back(), MulFunctor(), &frames_w); @@ -93,7 +91,7 @@ class StftKernel : public framework::OpKernel { framework::DDim onesided_dims(out->dims()); const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1; onesided_dims.at(axes.back()) = onesided_axis_size; - Tensor onesided_out; + phi::DenseTensor onesided_out; onesided_out.mutable_data(onesided_dims, ctx.GetPlace()); fft_r2c_func(dev_ctx, frames_w, &onesided_out, axes, normalization, true); phi::funcs::FFTFillConj( @@ -125,12 +123,12 @@ class StftGradKernel : public framework::OpKernel { const int seq_length = dx->dims()[dx_rank - 1]; std::vector axes = {1}; - Tensor d_frames_w; + phi::DenseTensor d_frames_w; framework::DDim d_frames_dims(dy->dims()); d_frames_dims.at(axes.back()) = n_fft; d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); - Tensor complex_d_frames_w; + 
phi::DenseTensor complex_d_frames_w; complex_d_frames_w.mutable_data(d_frames_dims, ctx.GetPlace()); // dy -> d_frames_w @@ -146,7 +144,7 @@ class StftGradKernel : public framework::OpKernel { fft_c2c_func( dev_ctx, *dy, &complex_d_frames_w, axes, normalization, false); } else { - Tensor full_dy; + phi::DenseTensor full_dy; full_dy.mutable_data(d_frames_dims, ctx.GetPlace()); auto zero_length = static_cast(full_dy.dims().at(axes.back()) - dy->dims().at(axes.back())); @@ -163,7 +161,7 @@ class StftGradKernel : public framework::OpKernel { phi::RealKernel(dev_ctx, complex_d_frames_w, &d_frames_w); // d_frames_w -> d_frames - Tensor d_frames; + phi::DenseTensor d_frames; d_frames.mutable_data(d_frames_dims, ctx.GetPlace()); ElementwiseComputeEx, DeviceContext, T>( ctx, &d_frames_w, window, axes.back(), MulFunctor(), &d_frames); diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index a91b210f2dc7b..c08f214ab58bc 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -26,8 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class StridedSliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -69,7 +67,7 @@ class StridedSliceOp : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor" || var_name == "StridesTensor") { @@ -174,7 +172,7 @@ class StridedSliceOpGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "StartsTensor" || var_name == "EndsTensor" || var_name == "StridesTensor") { diff --git a/paddle/fluid/operators/strided_slice_op_mlu.cc b/paddle/fluid/operators/strided_slice_op_mlu.cc index 6caf1ad5ad15f..21eb47f187b00 100644 --- a/paddle/fluid/operators/strided_slice_op_mlu.cc +++ b/paddle/fluid/operators/strided_slice_op_mlu.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using Variable = framework::Variable; using LoDTensorArray = framework::LoDTensorArray; using DDim = framework::DDim; @@ -100,7 +99,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -156,7 +155,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { auto infer_flags = ctx.Attr>("infer_flags"); auto decrease_axis = ctx.Attr>("decrease_axis"); - // vector> + // vector> auto list_new_starts_tensor = ctx.MultiInput("StartsTensorList"); auto list_new_ends_tensor = @@ -164,7 +163,7 @@ class StridedSliceMLUKernel : public framework::OpKernel { auto list_new_strides_tensor = ctx.MultiInput("StridesTensorList"); - // Tensor + // phi::DenseTensor if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { @@ -268,7 +267,7 @@ class StridedSliceGradMLUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc index f613dc1054088..23bf6ea689602 100644 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ b/paddle/fluid/operators/strided_slice_op_npu.cc @@ -20,7 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using Variable = framework::Variable; using LoDTensorArray = framework::LoDTensorArray; using DDim = framework::DDim; @@ -34,7 +33,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -87,7 +86,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { auto infer_flags = ctx.Attr>("infer_flags"); auto decrease_axis = ctx.Attr>("decrease_axis"); - // vector> + // vector> auto list_new_ends_tensor = ctx.MultiInput("EndsTensorList"); auto list_new_starts_tensor = @@ -95,7 +94,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { auto list_new_strides_tensor = ctx.MultiInput("StridesTensorList"); - // Tensor + // phi::DenseTensor if (list_new_starts_tensor.size() > 0) { starts = GetDataFromTensorList(list_new_starts_tensor); } else if (ctx.HasInput("StartsTensor")) { @@ -157,9 +156,9 @@ class StridedSliceNPUKernel : public framework::OpKernel { strides_indices_vector[axis_index] = strides[axis]; } - Tensor starts_indices_tensor; - Tensor ends_indices_tensor; - Tensor strides_indices_tensor; + phi::DenseTensor starts_indices_tensor; + phi::DenseTensor ends_indices_tensor; + phi::DenseTensor strides_indices_tensor; starts_indices_tensor.mutable_data({D}, place); ends_indices_tensor.mutable_data({D}, place); @@ -221,7 +220,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { runner.Run(stream); if (need_reverse) { - Tensor out_tmp; + phi::DenseTensor out_tmp; out_tmp.mutable_data(out_dims, place); 
paddle::framework::TensorCopy( *out, @@ -229,7 +228,7 @@ class StridedSliceNPUKernel : public framework::OpKernel { ctx.template device_context(), &out_tmp); - Tensor reverse_axis; + phi::DenseTensor reverse_axis; std::vector reverse_axis_vector; for (size_t axis = 0; axis < axes.size(); axis++) { if (reverse_vector[axis] == 1) { @@ -261,7 +260,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(is_tensor_array, false, platform::errors::InvalidArgument( - "Tensor array as input is not supported.")); + "phi::DenseTensor array as input is not supported.")); int rank = ctx.Input("Input")->dims().size(); switch (rank) { @@ -378,9 +377,9 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { strides_indices_vector[axis_index] = strides[axis]; } - Tensor starts_indices_tensor; - Tensor ends_indices_tensor; - Tensor strides_indices_tensor; + phi::DenseTensor starts_indices_tensor; + phi::DenseTensor ends_indices_tensor; + phi::DenseTensor strides_indices_tensor; starts_indices_tensor.mutable_data({D}, place); ends_indices_tensor.mutable_data({D}, place); @@ -397,7 +396,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { for (int i = 0; i < input_dims.size(); i++) { input_dims_vector.push_back(input_dims[i]); } - Tensor input_dims_tensor; + phi::DenseTensor input_dims_tensor; paddle::framework::TensorFromVector( input_dims_vector, dev_ctx, &input_dims_tensor); @@ -417,7 +416,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { {"shrink_axis_mask", 0}}; if (need_reverse) { - Tensor reverse_axis; + phi::DenseTensor reverse_axis; std::vector reverse_axis_vector; for (size_t axis = 0; axis < axes.size(); axis++) { if (reverse_vector[axis] == 1) { @@ -429,7 +428,7 @@ class StridedSliceGradNPUKernel : public framework::OpKernel { paddle::framework::TensorFromVector( reverse_axis_vector, dev_ctx, &reverse_axis); - Tensor dout_tmp; + phi::DenseTensor dout_tmp; dout_tmp.mutable_data(dout->dims(), place); const auto& runner_reverse = NpuOpRunner("ReverseV2", {*dout, reverse_axis}, {dout_tmp}); diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc index aad62e9ce2c33..a2f69a394902c 100644 --- a/paddle/fluid/operators/sum_op_mlu.cc +++ b/paddle/fluid/operators/sum_op_mlu.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template @@ -62,7 +61,7 @@ class SumMLUKernel : public framework::OpKernel { } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be Tensor or But got " + "Expected type of Output(out) must be phi::DenseTensor or But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); } diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index 20cc7ec18b8b7..afc489e2ab412 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using SelectedRows = phi::SelectedRows; template @@ -106,7 +105,7 @@ class SumNPUKernel : public framework::OpKernel { } } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be Tensor or " + "Expected type of Output(out) must be phi::DenseTensor or " "LoDTensorArray. 
But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index d6c306ff2a9f3..6358722e94390 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -36,7 +36,6 @@ namespace paddle { namespace operators { namespace math { -using Tensor = phi::DenseTensor; using InTensors = std::vector; using OutTensors = std::vector; using OpName = std::string; @@ -274,8 +273,8 @@ template struct DeviceIndependenceTensorOperations { // 1. Device indenpendence, for kernel reuse. // 2. Input and output is always tensor type. - // 3. output Tensor is alway allocated - // 4. Basic Tensor operator is supported + // 3. output phi::DenseTensor is alway allocated + // 4. Basic phi::DenseTensor operator is supported // 5. The Reused Operator Kernel should only be considered as // a wrap function using NameInTensorMap = @@ -382,8 +381,8 @@ struct DeviceIndependenceTensorOperations { } // batch_diag for CPU only - Tensor BatchDiag(const phi::DenseTensor& x, int batch) { - Tensor out; + phi::DenseTensor BatchDiag(const phi::DenseTensor& x, int batch) { + phi::DenseTensor out; auto* x_data = x.data>(); auto numel = x.numel(); auto* out_data = out.mutable_data>( @@ -411,7 +410,8 @@ struct DeviceIndependenceTensorOperations { } // a complex number x times a real number y, which is represented as (a+0j) - Tensor RealMulComplex(const phi::DenseTensor& x, const phi::DenseTensor& y) { + phi::DenseTensor RealMulComplex(const phi::DenseTensor& x, + const phi::DenseTensor& y) { phi::DenseTensor ret; std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(phi::make_ddim(out_shape)); @@ -650,8 +650,8 @@ struct DeviceIndependenceTensorOperations { return CreateOpRunAndReturnTensor("concat", inputs, attrs, out_shape); } - Tensor Conj(const phi::DenseTensor& x) { - Tensor out; + phi::DenseTensor Conj(const phi::DenseTensor& x) { + phi::DenseTensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); auto* x_data = x.data(); auto for_range = GetForRange(x.numel()); @@ -660,8 +660,8 @@ struct DeviceIndependenceTensorOperations { return out; } - Tensor Real(const phi::DenseTensor& x) { - Tensor out; + phi::DenseTensor Real(const phi::DenseTensor& x) { + phi::DenseTensor out; auto numel = x.numel(); auto* out_data = out.mutable_data>( x.dims(), @@ -674,13 +674,13 @@ struct DeviceIndependenceTensorOperations { return out; } - Tensor DiagFill(const int m, - const int n, - const int num_lower_diags, - const int num_upper_diags, - const phi::DenseTensor& scale, - const phi::DenseTensor& input) { - Tensor out; + phi::DenseTensor DiagFill(const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const phi::DenseTensor& scale, + const phi::DenseTensor& input) { + phi::DenseTensor out; auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, input.numel()); DiagAndFillFunctor diag_and_copy_functor( @@ -709,7 +709,7 @@ struct DeviceIndependenceTensorOperations { const std::vector& start, const std::vector& end, phi::DenseTensor* out) { - // Slice by call Eigen Tensor Function `.slice()` + // Slice by call Eigen phi::DenseTensor Function `.slice()` size_t rank = in->dims().size(); PADDLE_ENFORCE_EQ(start.size(), rank, @@ -752,7 +752,7 @@ struct DeviceIndependenceTensorOperations { op_outputs[out_name].emplace_back("tmp_" + out_name); } auto out_var = local_scope.Var("tmp_Out"); // return the Out - // create Out Tensor and 
allocat memory + // create Out phi::DenseTensor and allocat memory out_var->GetMutable()->mutable_data( phi::make_ddim(out_shape), context.GetPlace()); // phi::make_ddim(out_shape) diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc index b1e6bec8a4cad..2d037a7c3ecc1 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -26,7 +26,6 @@ namespace operators { #define NO_USE_CNCL 0 #define GET_LAYOUT_OFFSET 2 -using Tensor = phi::DenseTensor; static std::vector supported_input_layout = { CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC}; @@ -81,8 +80,8 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - Tensor trans_x; - Tensor trans_y; + phi::DenseTensor trans_x; + phi::DenseTensor trans_y; std::vector forward_perm; std::vector backward_perm; std::vector trans_shape; @@ -137,13 +136,13 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { } else { // training if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } - Tensor local_mean, local_var; + phi::DenseTensor local_mean, local_var; local_mean.mutable_data(mean->dims(), ctx.GetPlace()); local_var.mutable_data(variance->dims(), ctx.GetPlace()); MLUCnnlTensorDesc desc_mean_var(*mean_out); @@ -158,14 +157,14 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { desc_mean_var.get(), GetBasePtr(&local_var)); - Tensor input_count; + phi::DenseTensor input_count; input_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); FillMLUTensorWithHostValue( ctx, static_cast(x->numel() / C), &input_count); - Tensor count_all; - Tensor mean_all(mean->dtype()); - Tensor invstd_all(variance->dtype()); + phi::DenseTensor count_all; + phi::DenseTensor mean_all(mean->dtype()); + phi::DenseTensor invstd_all(variance->dtype()); #ifdef PADDLE_WITH_CNCL auto &dev_ctx = @@ -300,7 +299,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_inv_var = ctx.Input("SavedVariance"); - const Tensor *x; + const phi::DenseTensor *x; if (ctx.HasInput("Y")) { PADDLE_ENFORCE_EQ(true, false, @@ -342,9 +341,9 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { "OP(sync_batch_norm) be (1), but given (%d).", scale->dims().size())); - Tensor trans_x; - Tensor trans_dy; - Tensor trans_dx; + phi::DenseTensor trans_x; + phi::DenseTensor trans_dy; + phi::DenseTensor trans_dx; std::vector forward_perm; std::vector backward_perm; std::vector trans_shape; @@ -384,7 +383,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET], ToCnnlDataType()); - Tensor sum_dy, sum_dy_xmu; + phi::DenseTensor sum_dy, sum_dy_xmu; sum_dy.mutable_data(bias->dims(), ctx.GetPlace()); sum_dy_xmu.mutable_data(bias->dims(), ctx.GetPlace()); MLUCnnlTensorDesc desc_other_param(*bias); @@ -411,7 +410,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { d_scale ? true : false /*compute d_scale*/, d_bias ? 
true : false /*compute d_bias*/); - Tensor numel_count; + phi::DenseTensor numel_count; numel_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); FillMLUTensorWithHostValue( ctx, static_cast(x->numel() / C), &numel_count); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index 6cfd753c4ab6e..46b1ccc140ddb 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -20,8 +20,6 @@ limitations under the Licnse. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template void training_or_inference(const framework::ExecutionContext &ctx, const aclrtStream &stream, @@ -34,18 +32,18 @@ void training_or_inference(const framework::ExecutionContext &ctx, const int &W, const float epsilon, const float &momentum, - const Tensor *common_mean, - const Tensor *common_var, - const Tensor *x, - const Tensor *scale, - const Tensor *bias, - const Tensor *mean, - const Tensor *variance, - Tensor *mean_out, - Tensor *variance_out, - Tensor *saved_mean, - Tensor *saved_variance, - Tensor *y) { + const phi::DenseTensor *common_mean, + const phi::DenseTensor *common_var, + const phi::DenseTensor *x, + const phi::DenseTensor *scale, + const phi::DenseTensor *bias, + const phi::DenseTensor *mean, + const phi::DenseTensor *variance, + phi::DenseTensor *mean_out, + phi::DenseTensor *variance_out, + phi::DenseTensor *saved_mean, + phi::DenseTensor *saved_variance, + phi::DenseTensor *y) { std::vector axes; if (layout == phi::DataLayout::kNCHW) { axes = {0, 2, 3}; @@ -59,7 +57,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, else if (layout == phi::DataLayout::kNHWC) multiples = {N, H, W, 1}; - Tensor common_mean_tile_1; + phi::DenseTensor common_mean_tile_1; { common_mean_tile_1.Resize({C}); common_mean_tile_1.mutable_data(place); @@ -70,7 +68,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, common_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor common_mean_tile; + phi::DenseTensor common_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; common_mean_tile.Resize(x->dims()); @@ -80,7 +78,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor common_var_tile_1; + phi::DenseTensor common_var_tile_1; { common_var_tile_1.Resize({C}); common_var_tile_1.mutable_data(place); @@ -91,7 +89,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, common_var_tile_1.Resize({1, 1, 1, C}); } - Tensor common_var_tile; + phi::DenseTensor common_var_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; common_var_tile.Resize(x->dims()); @@ -101,7 +99,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor common_var_tile_add_epsilon; + phi::DenseTensor common_var_tile_add_epsilon; { framework::NPUAttributeMap attr_input = {{"value", epsilon}}; common_var_tile_add_epsilon.Resize(x->dims()); @@ -111,7 +109,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor common_var_tile_add_epsilon_sqrt; + phi::DenseTensor common_var_tile_add_epsilon_sqrt; { common_var_tile_add_epsilon_sqrt.Resize(x->dims()); common_var_tile_add_epsilon_sqrt.mutable_data(place); @@ -122,7 +120,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor x_sub_common_mean; + phi::DenseTensor x_sub_common_mean; { 
x_sub_common_mean.Resize(x->dims()); x_sub_common_mean.mutable_data(place); @@ -131,7 +129,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor normalized; + phi::DenseTensor normalized; { normalized.Resize(x->dims()); normalized.mutable_data(place); @@ -143,7 +141,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor scale_tile_1; + phi::DenseTensor scale_tile_1; { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); @@ -154,7 +152,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, scale_tile_1.Resize({1, 1, 1, C}); } - Tensor scale_tile; + phi::DenseTensor scale_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; scale_tile.Resize(x->dims()); @@ -164,7 +162,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor normalized_mul_scale; + phi::DenseTensor normalized_mul_scale; { normalized_mul_scale.Resize(x->dims()); normalized_mul_scale.mutable_data(place); @@ -173,7 +171,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor bias_tile_1; + phi::DenseTensor bias_tile_1; { bias_tile_1.Resize({C}); bias_tile_1.mutable_data(place); @@ -184,7 +182,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, bias_tile_1.Resize({1, 1, 1, C}); } - Tensor bias_tile; + phi::DenseTensor bias_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; bias_tile.Resize(x->dims()); @@ -203,7 +201,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, } if (!test_mode) { - Tensor ones; + phi::DenseTensor ones; { ones.Resize({C}); ones.mutable_data(place); @@ -212,7 +210,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, // cacl mean_out { - Tensor common_mean_mul_1_sub_momentum; + phi::DenseTensor common_mean_mul_1_sub_momentum; { framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; common_mean_mul_1_sub_momentum.Resize({C}); @@ -224,7 +222,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor mean_mul_momentum; + phi::DenseTensor mean_mul_momentum; { framework::NPUAttributeMap attr_input = {{"value", momentum}}; mean_mul_momentum.Resize({C}); @@ -246,7 +244,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, // cacl variance_out { - Tensor momentum_mul_var; + phi::DenseTensor momentum_mul_var; { framework::NPUAttributeMap attr_input = {{"value", momentum}}; momentum_mul_var.Resize({C}); @@ -256,7 +254,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor var_ref_mul_1_sub_momentum; + phi::DenseTensor var_ref_mul_1_sub_momentum; { framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; var_ref_mul_1_sub_momentum.Resize({C}); @@ -278,7 +276,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, // cacl saved_variance { - Tensor var_ref_add_epsilon; + phi::DenseTensor var_ref_add_epsilon; { framework::NPUAttributeMap attr_input = {{"value", epsilon}}; var_ref_add_epsilon.Resize({C}); @@ -288,7 +286,7 @@ void training_or_inference(const framework::ExecutionContext &ctx, runner.Run(stream); } - Tensor var_ref_add_epsilon_sqrt; + phi::DenseTensor var_ref_add_epsilon_sqrt; { var_ref_add_epsilon_sqrt.Resize({C}); var_ref_add_epsilon_sqrt.mutable_data(place); @@ -399,18 +397,18 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { } else { 
// training if (ctx.HasInput("MomentumTensor")) { const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; + phi::DenseTensor mom_cpu; paddle::framework::TensorCopySync( *mom_tensor, platform::CPUPlace(), &mom_cpu); momentum = mom_cpu.data()[0]; } // cacl saved_mean and var_ref - Tensor var_ref; + phi::DenseTensor var_ref; var_ref.Resize({C}); var_ref.mutable_data(place); { - Tensor x_sum; + phi::DenseTensor x_sum; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -421,7 +419,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square; + phi::DenseTensor x_square; { x_square.Resize(x->dims()); x_square.mutable_data(place); @@ -429,7 +427,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square_sum; + phi::DenseTensor x_square_sum; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -447,7 +445,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { HcclDataType dtype = platform::ToHCCLDataType( framework::TransToProtoVarType(mean_out->dtype())); - Tensor device_count_tensor; + phi::DenseTensor device_count_tensor; { device_count_tensor.Resize({1}); device_count_tensor.mutable_data(place); @@ -517,7 +515,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { // cacl var_ref { - Tensor saved_mean_square; + phi::DenseTensor saved_mean_square; { saved_mean_square.Resize({C}); saved_mean_square.mutable_data(place); @@ -526,7 +524,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tmp; + phi::DenseTensor var_ref_tmp; var_ref_tmp.Resize({C}); var_ref_tmp.mutable_data(place); { @@ -589,7 +587,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { auto *d_bias = ctx.Output(framework::GradVarName("Bias")); const auto *saved_mean = ctx.Input("SavedMean"); - const Tensor *x; + const phi::DenseTensor *x; if (ctx.HasInput("Y")) { PADDLE_ENFORCE_EQ(true, false, @@ -627,7 +625,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { float device_counts = 0.0; if (comm) { - Tensor device_count_tensor; + phi::DenseTensor device_count_tensor; { device_count_tensor.Resize({1}); device_count_tensor.mutable_data(place); @@ -660,13 +658,13 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } // cacl var_ref - Tensor var_ref; + phi::DenseTensor var_ref; var_ref.Resize({C}); var_ref.mutable_data(place); { // cacl var_ref { - Tensor x_square; + phi::DenseTensor x_square; { x_square.Resize(x->dims()); x_square.mutable_data(place); @@ -674,7 +672,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square_sum; + phi::DenseTensor x_square_sum; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -685,7 +683,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_square_sum_mean; + phi::DenseTensor x_square_sum_mean; { framework::NPUAttributeMap attr_input = { {"value", 1.0f * C / x_numel}}; @@ -696,7 +694,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor mean_square; + phi::DenseTensor mean_square; { mean_square.Resize({C}); mean_square.mutable_data(place); @@ -714,7 +712,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor saved_mean_tile_1; + phi::DenseTensor saved_mean_tile_1; { saved_mean_tile_1.Resize({C}); 
saved_mean_tile_1.mutable_data(place); @@ -725,7 +723,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { saved_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor saved_mean_tile; + phi::DenseTensor saved_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; saved_mean_tile.Resize(x->dims()); @@ -735,7 +733,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor x_sub_saved_mean; + phi::DenseTensor x_sub_saved_mean; { x_sub_saved_mean.Resize(x->dims()); x_sub_saved_mean.mutable_data(place); @@ -744,7 +742,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tile_1; + phi::DenseTensor var_ref_tile_1; { var_ref_tile_1.Resize({C}); var_ref_tile_1.mutable_data(place); @@ -755,7 +753,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { var_ref_tile_1.Resize({1, 1, 1, C}); } - Tensor var_ref_tile; + phi::DenseTensor var_ref_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; var_ref_tile.Resize(x->dims()); @@ -765,7 +763,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tile_add_epsilon; + phi::DenseTensor var_ref_tile_add_epsilon; { framework::NPUAttributeMap attr_input = {{"value", epsilon}}; var_ref_tile_add_epsilon.Resize(x->dims()); @@ -775,7 +773,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor var_ref_tile_add_epsilon_sqrt; + phi::DenseTensor var_ref_tile_add_epsilon_sqrt; { var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); var_ref_tile_add_epsilon_sqrt.mutable_data(place); @@ -786,7 +784,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor dy_mul_x_sub_mean_for_scale; + phi::DenseTensor dy_mul_x_sub_mean_for_scale; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -804,7 +802,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor dy_mul_x_sub_mean; + phi::DenseTensor dy_mul_x_sub_mean; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -849,7 +847,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // cacl d_x if (d_x) { - Tensor dy_mean; + phi::DenseTensor dy_mean; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -896,7 +894,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor dy_mean_tile_1; + phi::DenseTensor dy_mean_tile_1; { dy_mean_tile_1.Resize({C}); dy_mean_tile_1.mutable_data(place); @@ -907,7 +905,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { dy_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor dy_mean_tile; + phi::DenseTensor dy_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; dy_mean_tile.Resize(x->dims()); @@ -917,7 +915,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor dy_sub_dy_mean; + phi::DenseTensor dy_sub_dy_mean; { if (framework::TransToProtoVarType(d_y->dtype()) == framework::proto::VarType::FP16) { @@ -935,7 +933,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } } - Tensor dy_mul_x_sub_mean_mean; + phi::DenseTensor dy_mul_x_sub_mean_mean; { framework::NPUAttributeMap attr_input = {{"keep_dims", false}, {"axes", axes}}; @@ -948,7 +946,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - 
Tensor dy_mul_x_sub_mean_mean_tile_1; + phi::DenseTensor dy_mul_x_sub_mean_mean_tile_1; { dy_mul_x_sub_mean_mean_tile_1.Resize({C}); dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); @@ -960,7 +958,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); } - Tensor dy_mul_x_sub_mean_mean_tile; + phi::DenseTensor dy_mul_x_sub_mean_mean_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); @@ -974,7 +972,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // (x - mean) * np.mean(dy * (x - mean), axis=axis) // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile - Tensor tmp1; + phi::DenseTensor tmp1; { tmp1.Resize(x->dims()); tmp1.mutable_data(place); @@ -986,7 +984,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) // tmp1 / (var + epsilon) // tmp1 / var_ref_tile_add_epsilon - Tensor tmp2; + phi::DenseTensor tmp2; { tmp2.Resize(x->dims()); tmp2.mutable_data(place); @@ -998,7 +996,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / // (var + epsilon) // dy_sub_dy_mean - tmp2 - Tensor tmp3; + phi::DenseTensor tmp3; { tmp3.Resize(x->dims()); tmp3.mutable_data(place); @@ -1007,7 +1005,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { runner.Run(stream); } - Tensor scale_tile_1; + phi::DenseTensor scale_tile_1; { scale_tile_1.Resize({C}); scale_tile_1.mutable_data(place); @@ -1018,7 +1016,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { scale_tile_1.Resize({1, 1, 1, C}); } - Tensor scale_tile; + phi::DenseTensor scale_tile; { framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; scale_tile.Resize(x->dims()); @@ -1031,7 +1029,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // scale * (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), // axis) / (var + epsilon)) // scale * tmp3 - Tensor dx_1; + phi::DenseTensor dx_1; { dx_1.Resize(x->dims()); dx_1.mutable_data(place); @@ -1052,7 +1050,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { // cacl d_scale if (d_scale) { - Tensor d_scale_2; + phi::DenseTensor d_scale_2; { d_scale_2.Resize(x->dims()); d_scale_2.mutable_data(place); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc index d4f06e6446887..3eed4989bb7ea 100644 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ b/paddle/fluid/operators/take_along_axis_op_npu.cc @@ -22,8 +22,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class NPUTakeAlongAxisKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 3f781ab65eeb8..b41453b849bc4 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -28,7 +28,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/tdm_sampler_op.h b/paddle/fluid/operators/tdm_sampler_op.h index d98680c574154..1ba0e2c66be8d 100644 --- a/paddle/fluid/operators/tdm_sampler_op.h +++ b/paddle/fluid/operators/tdm_sampler_op.h @@ -29,7 +29,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using Sampler = math::Sampler; using DDim = framework::DDim; using LoD = framework::LoD; diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index f880181662e24..bad4479868053 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -201,15 +199,17 @@ class TeacherStudentSigmoidLossOpMaker public: void Make() override { AddInput("X", - "(Tensor, default Tensor), a 2-D tensor with shape [N x 1]," + "(phi::DenseTensor, default phi::DenseTensor), a 2-D " + "tensor with shape [N x 1]," " where N is the batch size and D is the output. " "This input is a probability computed by the previous operator, " "which is almost always the result of a softmax operator."); AddInput("Label", - "(Tensor), the ground truth which is a 2-D tensor. " - "Label is a Tensor with shape [N x 1]. "); + "(phi::DenseTensor), the ground truth which is a 2-D tensor. " + "Label is a phi::DenseTensor with shape [N x 1]. "); AddOutput("Y", - "(Tensor, default Tensor), a 2-D tensor with shape " + "(phi::DenseTensor, default phi::DenseTensor), a 2-D " + "tensor with shape " "[N x 1]. The teacher student sigmoid loss."); AddAttr( "soft_max_up_bound", diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h index 40bac8c364583..133d9656284f3 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class TeacherStudentSigmoidLossOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index 5ea2ead118892..ec2533316e107 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -16,7 +16,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DataLayout = phi::DataLayout; template diff --git a/paddle/fluid/operators/tile_op_mlu.cc b/paddle/fluid/operators/tile_op_mlu.cc index 2b2b3df4431f1..3660627b8b578 100644 --- a/paddle/fluid/operators/tile_op_mlu.cc +++ b/paddle/fluid/operators/tile_op_mlu.cc @@ -18,8 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class TileMLUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 2997052257d18..4ae1f6cbed330 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using NPUDeviceContext = platform::NPUDeviceContext; template diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index ab90fa78d3d45..f1674bc5005a0 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -30,8 +30,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - #define FIXED_BLOCK_DIM_BASE(dim, ...) \ case (dim): { \ constexpr auto kBlockDim = (dim); \ @@ -74,7 +72,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { auto* k_t = ctx.Input("K"); if (k_t) { - Tensor k_host; + phi::DenseTensor k_host; framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); k = k_host.data()[0]; framework::DDim output_dims = output->dims(); diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index cd29137d530f4..27f246415a94c 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class TopkKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index cbe5c224ae1d3..5b9b507989952 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -65,7 +65,7 @@ class TopkNPUKernel : public framework::OpKernel { {"dim", -1}, {"largest", true}}; - Tensor tmp_indices(experimental::DataType::INT32); + phi::DenseTensor tmp_indices(experimental::DataType::INT32); tmp_indices.Resize(indices->dims()); tmp_indices.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc index 25f3faa38a0c5..df1725265ebde 100644 --- a/paddle/fluid/operators/top_k_op_xpu.cc +++ b/paddle/fluid/operators/top_k_op_xpu.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; template class TopkXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index ee37c2e9fe09b..cab0796a71019 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -22,7 +22,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using DDim = framework::DDim; template class TreeConvKernel : public framework::OpKernel { @@ -40,7 +39,7 @@ class TreeConvKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - Tensor W; + phi::DenseTensor W; W.ShareDataWith(*Filter); W.Resize(phi::flatten_to_2d(Filter->dims(), 2)); @@ -67,7 +66,7 @@ class TreeConvKernel : public framework::OpKernel { auto embeddings = Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim); auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim); - Tensor patch; + phi::DenseTensor patch; tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); constant(dev_ctx, &out_vec, 0); blas.MatMul(patch, W, &out_vec); @@ -93,7 +92,7 @@ class TreeConvGradKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - Tensor W; + phi::DenseTensor W; W.ShareDataWith(*Filter); W.Resize(phi::flatten_to_2d(Filter->dims(), 1)); @@ -110,7 +109,7 @@ class TreeConvGradKernel : public framework::OpKernel { out_grad_dims = phi::flatten_to_2d(out_grad_dims, 1); if (filter_g) { filter_g->mutable_data(Filter->dims(), ctx.GetPlace()); - Tensor f_g; + phi::DenseTensor f_g; f_g.ShareDataWith(*filter_g); f_g.Resize(phi::flatten_to_2d(Filter->dims(), 2)); constant(dev_ctx, filter_g, 0); @@ -121,7 +120,7 @@ class TreeConvGradKernel : public framework::OpKernel { .Resize(embedding_slicedim); auto out_grad = out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - Tensor patch; + phi::DenseTensor patch; tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0)); } @@ -138,7 +137,7 @@ class TreeConvGradKernel : public framework::OpKernel { out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); auto in_grad = in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims); - Tensor in_grad_temp; + phi::DenseTensor in_grad_temp; col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth); blas.MatMul(in_grad_temp, false, W, true, &in_grad); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index b5e67ccb24a9a..c3b2e24892e40 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -22,37 +22,35 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal std::vector shape = ctx.Attr>("shape"); - Tensor shape_tensor(experimental::DataType::INT32); + phi::DenseTensor shape_tensor(experimental::DataType::INT32); shape_tensor.mutable_data({static_cast(shape.size())}, ctx.GetPlace()); paddle::framework::TensorFromVector( shape, ctx.device_context(), &shape_tensor); float mean = ctx.Attr("mean"); - Tensor mean_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor mean_tensor(experimental::DataType::FLOAT32); mean_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&mean_tensor, mean); float std = ctx.Attr("std"); - Tensor std_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor std_tensor(experimental::DataType::FLOAT32); std_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&std_tensor, std); int32_t seed_var = ctx.Attr("seed"); - Tensor min_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor min_tensor(experimental::DataType::FLOAT32); min_tensor.mutable_data({1}, ctx.GetPlace()); float min_value = mean - std * 2.0; FillNpuTensorWithConstant(&min_tensor, min_value); - Tensor max_tensor(experimental::DataType::FLOAT32); + phi::DenseTensor max_tensor(experimental::DataType::FLOAT32); max_tensor.mutable_data({1}, ctx.GetPlace()); float max_value = mean + std * 2.0; FillNpuTensorWithConstant(&max_tensor, max_value); @@ -83,7 +81,7 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); tensor->mutable_data(context.GetPlace()); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); std::uniform_real_distribution dist(std::numeric_limits::min(), diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 0da82f73028d9..7ba22baff99b9 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -145,7 +145,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, - const Tensor &tensor, + const phi::DenseTensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensorList" || var_name == "ShapeTensor") { return expected_kernel_type; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 8dd7a140ae914..4c60cb76fb9ea 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -46,7 +46,8 @@ class GPUUniformRandomKernel : public framework::OpKernel { if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be Tensor, " + "Expected type of Output(out) in uniform_random_op must be " + "phi::DenseTensor, " "SelectedRows. 
But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 3ddf6092f04bf..05a643b33b215 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -30,7 +30,6 @@ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; inline std::vector GetNewDataFromShapeTensor( const phi::DenseTensor* new_data_tensor) { diff --git a/paddle/fluid/operators/uniform_random_op_mlu.cc b/paddle/fluid/operators/uniform_random_op_mlu.cc index 1f7f3e2f2bad3..8e5f61c831088 100644 --- a/paddle/fluid/operators/uniform_random_op_mlu.cc +++ b/paddle/fluid/operators/uniform_random_op_mlu.cc @@ -50,7 +50,8 @@ class MLUUniformRandomKernel : public framework::OpKernel { if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be Tensor, " + "Expected type of Output(out) in uniform_random_op must be " + "phi::DenseTensor, " "SelectedRows. But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); @@ -59,7 +60,7 @@ class MLUUniformRandomKernel : public framework::OpKernel { tensor->mutable_data(ctx.GetPlace()); int64_t size = tensor->numel(); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc index b1499b30fede7..e82c6e1f2a91a 100644 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ b/paddle/fluid/operators/uniform_random_op_npu.cc @@ -52,7 +52,8 @@ class NPUUniformRandomKernel : public framework::OpKernel { if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be Tensor, " + "Expected type of Output(out) in uniform_random_op must be " + "phi::DenseTensor, " "SelectedRows. But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); @@ -60,7 +61,7 @@ class NPUUniformRandomKernel : public framework::OpKernel { tensor->mutable_data(ctx.GetPlace()); int64_t size = tensor->numel(); - Tensor cpu_tensor(tensor->dtype()); + phi::DenseTensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 35118ae64876c..b470874f26083 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; void VarConv2dOpMaker::Make() { @@ -36,7 +35,7 @@ void VarConv2dOpMaker::Make() { "(phi::DenseTensor) the row variable provides lod information"); AddInput("COLUMN", "(phi::DenseTensor) the column variable provides lod information"); - AddInput("W", "W (Tensor), the filter."); + AddInput("W", "W (phi::DenseTensor), the filter."); AddAttr("InputChannel", "the input filter num").SetDefault(1); AddAttr("OutputChannel", "the output filter num").SetDefault(1); AddAttr("StrideH", "the height of Stride").SetDefault(1); @@ -130,11 +129,11 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { framework::Variable* x_var = PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("X")[0]); const auto& x_lod = x_var->Get().lod(); - PADDLE_ENFORCE_EQ( - !x_lod.empty(), - true, - platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP " - "does not contain LoD information.")); + PADDLE_ENFORCE_EQ(!x_lod.empty(), + true, + platform::errors::InvalidArgument( + "The Input(X) phi::DenseTensor of VarConv2dOP " + "does not contain LoD information.")); PADDLE_ENFORCE_GE(x_lod.size(), 1, @@ -151,20 +150,22 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { framework::Variable* row_var = PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]); const auto& row_lod = row_var->Get().lod(); - PADDLE_ENFORCE_EQ(!row_lod.empty(), - true, - platform::errors::InvalidArgument( - "The Input(ROW) Tensor of VarConv2dOP does not " - "contain LoD information.")); + PADDLE_ENFORCE_EQ( + !row_lod.empty(), + true, + platform::errors::InvalidArgument( + "The Input(ROW) phi::DenseTensor of VarConv2dOP does not " + "contain LoD information.")); framework::Variable* col_var = PADDLE_GET(framework::Variable*, ctx->GetInputVarPtrs("COLUMN")[0]); const auto& col_lod = col_var->Get().lod(); - PADDLE_ENFORCE_EQ(!col_lod.empty(), - true, - platform::errors::InvalidArgument( - "The Input(COLUMN) Tensor of VarConv2dOP does not " - "contain LoD information.")); + PADDLE_ENFORCE_EQ( + !col_lod.empty(), + true, + platform::errors::InvalidArgument( + "The Input(COLUMN) phi::DenseTensor of VarConv2dOP does not " + "contain LoD information.")); } else { std::vector out_dims_vec{-1}; out_dims_vec.push_back(1); @@ -468,7 +469,7 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* d_w = ctx.Output(framework::GradVarName("W")); - Tensor col_grad; + phi::DenseTensor col_grad; col_grad.Resize(col->dims()); auto* col_diff = col_grad.mutable_data(ctx.GetPlace()); auto* dx_data = dx->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h index 1a5fa9de2c7ce..cc0c97e671e8a 100644 --- a/paddle/fluid/operators/var_conv_2d_op.h +++ b/paddle/fluid/operators/var_conv_2d_op.h @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; using LoD = framework::LoD; class VarConv2dOP : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/where_index_op_mlu.cc b/paddle/fluid/operators/where_index_op_mlu.cc index 85f463f723ef5..59ffb43f7ce5c 100644 --- a/paddle/fluid/operators/where_index_op_mlu.cc +++ b/paddle/fluid/operators/where_index_op_mlu.cc @@ -20,8 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class MLUWhereIndexKernel : public framework::OpKernel { public: @@ -31,7 +29,7 @@ class MLUWhereIndexKernel : public framework::OpKernel { auto dims = condition->dims(); const int rank = dims.size(); - Tensor num_true; + phi::DenseTensor num_true; num_true.mutable_data({1}, context.GetPlace()); MLUCnnlTensorDesc con_desc(*condition); MLUCnnlTensorDesc num_true_desc(num_true); @@ -41,7 +39,7 @@ class MLUWhereIndexKernel : public framework::OpKernel { num_true_desc.get(), GetBasePtr(&num_true)); - Tensor local_true_num; + phi::DenseTensor local_true_num; paddle::framework::TensorCopySync( num_true, platform::CPUPlace(), &local_true_num); auto true_num = *local_true_num.data(); diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc index 5b006cbdcf1b0..d888513c2ebd2 100644 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ b/paddle/fluid/operators/where_index_op_npu.cc @@ -21,8 +21,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = phi::DenseTensor; - template class NPUWhereIndexKernel : public framework::OpKernel { public: @@ -39,7 +37,7 @@ class NPUWhereIndexKernel : public framework::OpKernel { const aclrtStream& stream = dev_ctx.stream(); // Run Cast and ReduceSum to get 0 dim of Out - Tensor booled_cond; + phi::DenseTensor booled_cond; if (framework::TransToProtoVarType(condition->dtype()) != framework::proto::VarType::BOOL) { auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL); @@ -53,7 +51,7 @@ class NPUWhereIndexKernel : public framework::OpKernel { } else { booled_cond.ShareDataWith(*condition); } - Tensor casted_cond; + phi::DenseTensor casted_cond; auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64); casted_cond.mutable_data(dims, place); const auto& cast_runner = @@ -63,9 +61,9 @@ class NPUWhereIndexKernel : public framework::OpKernel { {{"dst_type", static_cast(dst_dtype)}}); cast_runner.Run(stream); - Tensor sumed_true_num; + phi::DenseTensor sumed_true_num; sumed_true_num.mutable_data({1}, place); - Tensor cond_axes; + phi::DenseTensor cond_axes; cond_axes.mutable_data({dims.size()}, place); std::vector axes_vec; for (int i = 0; i < dims.size(); ++i) { @@ -78,7 +76,7 @@ class NPUWhereIndexKernel : public framework::OpKernel { {{"keep_dims", false}}); sum_runner.Run(stream); - Tensor local_true_num; + phi::DenseTensor local_true_num; paddle::framework::TensorCopySync( sumed_true_num, platform::CPUPlace(), &local_true_num); auto true_num = *local_true_num.data();
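// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: every hunk above applies the
// same mechanical refactor -- the file-local alias `using Tensor =
// phi::DenseTensor;` is deleted and each use of `Tensor` is spelled out as
// `phi::DenseTensor`. The self-contained example below only shows the
// before/after shape of that rename with a stand-in type; the real
// phi::DenseTensor, operator kernels, and NPU/MLU runners are NOT reproduced,
// so all names here are hypothetical.
#include <cstdint>
#include <vector>

namespace phi {
// Minimal stand-in so the sketch compiles on its own; it is not the real API.
class DenseTensor {
 public:
  void Resize(const std::vector<int64_t>& dims) { dims_ = dims; }
  const std::vector<int64_t>& dims() const { return dims_; }

 private:
  std::vector<int64_t> dims_;
};
}  // namespace phi

namespace before_patch {
using Tensor = phi::DenseTensor;  // the alias this patch removes

Tensor MakeScalar() {  // old spelling through the alias
  Tensor t;
  t.Resize({1});
  return t;
}
}  // namespace before_patch

namespace after_patch {
phi::DenseTensor MakeScalar() {  // new spelling, fully qualified
  phi::DenseTensor t;
  t.Resize({1});
  return t;
}
}  // namespace after_patch

int main() {
  // Both spellings name the same type; only the source text changes.
  return after_patch::MakeScalar().dims().size() == 1 ? 0 : 1;
}
// ---------------------------------------------------------------------------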